diff --git a/Grid/GridCore.h b/Grid/GridCore.h index a48d2d49..2209f960 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -47,9 +47,9 @@ Author: paboyle #include #include #include -#include +#include #include -#include +#include #include #include #include diff --git a/Grid/GridStd.h b/Grid/GridStd.h index 16cfcf50..ecb561ea 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -6,6 +6,7 @@ /////////////////// #include #include +#include #include #include #include diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 9d779e05..9556c03d 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -18,19 +18,20 @@ #pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__NVCC__") #pragma push_macro("__CUDACC__") +#undef __CUDA_ARCH__ #undef __NVCC__ #undef __CUDACC__ -#undef __CUDA_ARCH__ #define __NVCC__REDEFINE__ #endif /* SYCL save and restore compile environment*/ -#ifdef __SYCL_DEVICE_ONLY__ +#ifdef GRID_SYCL #pragma push #pragma push_macro("__SYCL_DEVICE_ONLY__") #undef __SYCL_DEVICE_ONLY__ -#undef EIGEN_USE_SYCL #define EIGEN_DONT_VECTORIZE +//#undef EIGEN_USE_SYCL +#define __SYCL__REDEFINE__ #endif @@ -41,7 +42,7 @@ #ifdef __NVCC__REDEFINE__ #pragma pop_macro("__CUDACC__") #pragma pop_macro("__NVCC__") -#pragma pop_macro("__CUDA_ARCH__") +#pragma pop_macro("GRID_SIMT") #pragma pop #endif diff --git a/Grid/Makefile.am b/Grid/Makefile.am index b88ea4f2..f1fa462e 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -21,7 +21,7 @@ if BUILD_HDF5 extra_headers+=serialisation/Hdf5Type.h endif -all: version-cache +all: version-cache Version.h version-cache: @if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ @@ -42,7 +42,7 @@ version-cache: fi;\ rm -f vertmp -Version.h: +Version.h: version-cache cp version-cache Version.h .PHONY: version-cache diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 48ea194b..7f27784b 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -29,9 +29,11 @@ Author: Peter Boyle #ifndef GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H +NAMESPACE_CHECK(algorithms); #include #include #include +NAMESPACE_CHECK(SparseMatrix); #include #include @@ -41,10 +43,12 @@ Author: Peter Boyle #include #include #include - +NAMESPACE_CHECK(approx); #include #include +NAMESPACE_CHECK(ConjGrad); #include +NAMESPACE_CHECK(BiCGSTAB); #include #include #include @@ -62,7 +66,9 @@ Author: Peter Boyle #include #include +NAMESPACE_CHECK(PowerMethod); #include +NAMESPACE_CHECK(CoarsendMatrix); #include #endif diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 4c26f799..8d184aea 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,14 +1,3 @@ - // blockZaxpy in bockPromote - 3s, 5% - // noncoalesced linalg in Preconditionoer ~ 3s 5% - // Lancos tuning or replace 10-20s ~ 25%, open ended - // setup tuning 5s ~ 8% - // -- e.g. ordermin, orderstep tunables. - // MdagM path without norm in LinOp code. 
few seconds - - // Mdir calc blocking kernels - // Fuse kernels in blockMaskedInnerProduct - // preallocate Vectors in Cayley 5D ~ few percent few seconds - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -91,34 +80,7 @@ public: } directions [2*_d]=0; displacements[2*_d]=0; - - //// report back - std::cout< GetDelta(int point) { - std::vector delta(dimension,0); - delta[directions[point]] = displacements[point]; - return delta; - }; - */ }; @@ -149,25 +111,7 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<oSites(),1,{ - eProj[ss](i)=CComplex(1.0); - }); - eProj=eProj - iProj; - std::cout< &hermop,int nn=nbasis) { @@ -190,12 +129,12 @@ public: FineField Mn(FineGrid); for(int b=0;b "< &hermop, int nn, double hi, @@ -280,10 +219,10 @@ public: hermop.HermOp(*Tn,y); - auto y_v = y.View(); - auto Tn_v = Tn->View(); - auto Tnp_v = Tnp->View(); - auto Tnm_v = Tnm->View(); + autoView( y_v , y, AcceleratorWrite); + autoView( Tn_v , (*Tn), AcceleratorWrite); + autoView( Tnp_v , (*Tnp), AcceleratorWrite); + autoView( Tnm_v , (*Tnm), AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); @@ -313,201 +252,6 @@ public: } assert(b==nn); } -#endif -#if 0 - virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, - int nn, - double hi, - double lo, - int orderfilter, - int ordermin, - int orderstep, - double filterlo - ) { - - RealD scale; - - FineField noise(FineGrid); - FineField Mn(FineGrid); - FineField tmp(FineGrid); - FineField combined(FineGrid); - - // New normalised noise - gaussian(RNG,noise); - scale = std::pow(norm2(noise),-0.5); - noise=noise*scale; - - // Initial matrix element - hermop.Op(noise,Mn); std::cout< "< Cheb(llo,hhi,oorder); \ - Cheb(hermop,noise,Mn); \ - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ - subspace[b] = Mn; \ - hermop.Op(Mn,tmp); \ - std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ - - RealD alpha=-0.8; - RealD beta =-0.8; -#define FILTER(llo,hhi,oorder) \ - { \ - Chebyshev Cheb(llo,hhi,oorder); \ - /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ - Cheb(hermop,noise,Mn); \ - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ - subspace[b] = Mn; \ - hermop.Op(Mn,tmp); \ - std::cout< "< Cheb(llo,hhi,oorder); \ - Cheb(hermop,noise,combined); \ - } - - double node = 0.000; - FILTERb(lo,hi,orderfilter);// 0 - // FILTERc(node,hi,51);// 0 - noise = Mn; - int base = 0; - int mult = 100; - FILTER(node,hi,base+1*mult); - FILTER(node,hi,base+2*mult); - FILTER(node,hi,base+3*mult); - FILTER(node,hi,base+4*mult); - FILTER(node,hi,base+5*mult); - FILTER(node,hi,base+6*mult); - FILTER(node,hi,base+7*mult); - FILTER(node,hi,base+8*mult); - FILTER(node,hi,base+9*mult); - FILTER(node,hi,base+10*mult); - FILTER(node,hi,base+11*mult); - FILTER(node,hi,base+12*mult); - FILTER(node,hi,base+13*mult); - FILTER(node,hi,base+14*mult); - FILTER(node,hi,base+15*mult); - assert(b==nn); - } -#endif - -#if 0 - virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, - int nn, - double hi, - double lo, - int orderfilter, - int ordermin, - int orderstep, - double filterlo - ) { - - RealD scale; - - FineField noise(FineGrid); - FineField Mn(FineGrid); - FineField tmp(FineGrid); - FineField combined(FineGrid); - - // New normalised noise - gaussian(RNG,noise); - scale = 
std::pow(norm2(noise),-0.5); - noise=noise*scale; - - // Initial matrix element - hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); - // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); - //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); - // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); - JacobiPoly(hermop,noise,Mn); - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; - subspace[b] = Mn; - hermop.Op(Mn,tmp); - std::cout< "< "< Stencil; std::vector A; - + /////////////////////// // Interface /////////////////////// @@ -549,13 +293,13 @@ public: SimpleCompressor compressor; Stencil.HaloExchange(in,compressor); - - auto in_v = in.View(); - auto out_v = out.View(); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); typedef LatticeView Aview; - + Vector AcceleratorViewContainer; - for(int p=0;p_is_local) { - nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); } - synchronise(); + acceleratorSynchronise(); for(int bb=0;bb Aview; Vector AcceleratorViewContainer; - for(int p=0;p_is_local) { - nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); } - synchronise(); + acceleratorSynchronise(); for(int bb=0;bboSites(),1,{ - - siteVector res = Zero(); - siteVector nbr; - int ptype; - StencilEntry *SE; - - SE=Stencil.GetEntry(ptype,point,ss); - - if(SE->_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; - } else { - nbr = Stencil.CommBuf()[SE->_offset]; - } - synchronise(); - - res = res + Aview_p[point][ss]*nbr; - - out_v[ss]=res; - }); -#endif + for(int p=0;p &out) { @@ -841,10 +562,10 @@ public: blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi); - auto iZProj_v = iZProj.View() ; - auto oZProj_v = oZProj.View() ; - auto A_p = A[p].View(); - auto A_self = A[self_stencil].View(); + autoView( iZProj_v , iZProj, AcceleratorRead) ; + autoView( oZProj_v , oZProj, AcceleratorRead) ; + autoView( A_p , A[p], AcceleratorWrite); + autoView( A_self , A[self_stencil], AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); @@ -860,11 +581,11 @@ public: mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); { - auto tmp_ = tmp.View(); - auto evenmask_ = evenmask.View(); - auto oddmask_ = oddmask.View(); - auto Mphie_ = Mphie.View(); - auto Mphio_ = Mphio.View(); + autoView( tmp_ , tmp, AcceleratorWrite); + autoView( evenmask_ , evenmask, AcceleratorRead); + autoView( oddmask_ , oddmask, AcceleratorRead); + autoView( Mphie_ , Mphie, AcceleratorRead); + autoView( Mphio_ , Mphio, AcceleratorRead); accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); }); @@ -872,8 +593,8 @@ public: blockProject(SelfProj,tmp,Subspace.subspace); - auto SelfProj_ = SelfProj.View(); - auto A_self = A[self_stencil].View(); + autoView( SelfProj_ , SelfProj, AcceleratorRead); + autoView( A_self , A[self_stencil], AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j bc(FineGrid->_ndimension,0); - - 
blockPick(Grid(),phi,tmp,bc); // Pick out a block - linop.Op(tmp,Mphi); // Apply big dop - blockProject(iProj,Mphi,Subspace.subspace); // project it and print it - std::cout< #endif #endif - NAMESPACE_BEGIN(Grid); template struct FFTW { }; @@ -191,7 +189,7 @@ public: typedef typename sobj::scalar_type scalar; Lattice pgbuf(&pencil_g); - auto pgbuf_v = pgbuf.View(); + autoView(pgbuf_v , pgbuf, CpuWrite); typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; @@ -232,15 +230,18 @@ public: result = source; int pc = processor_coor[dim]; for(int p=0;plSites(),{ + { + autoView(r_v,result,CpuRead); + autoView(p_v,pgbuf,CpuWrite); + thread_for(idx, sgrid->lSites(),{ Coordinate cbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,cbuf); - peekLocalSite(s,result,cbuf); + peekLocalSite(s,r_v,cbuf); cbuf[dim]+=((pc+p) % processors[dim])*L; - // cbuf[dim]+=p*L; - pokeLocalSite(s,pgbuf,cbuf); - }); + pokeLocalSite(s,p_v,cbuf); + }); + } if (p != processors[dim] - 1) { result = Cshift(result,dim,L); } @@ -269,15 +270,19 @@ public: flops+= flops_call*NN; // writing out result - thread_for(idx,sgrid->lSites(),{ + { + autoView(pgbuf_v,pgbuf,CpuRead); + autoView(result_v,result,CpuWrite); + thread_for(idx,sgrid->lSites(),{ Coordinate clbuf(Nd), cgbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,clbuf); cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; - peekLocalSite(s,pgbuf,cgbuf); - pokeLocalSite(s,result,clbuf); - }); + peekLocalSite(s,pgbuf_v,cgbuf); + pokeLocalSite(s,result_v,clbuf); + }); + } result = result*div; // destroying plan diff --git a/Grid/algorithms/iterative/BiCGSTAB.h b/Grid/algorithms/iterative/BiCGSTAB.h index 3a7be1ef..f4e5cdda 100644 --- a/Grid/algorithms/iterative/BiCGSTAB.h +++ b/Grid/algorithms/iterative/BiCGSTAB.h @@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction LinearCombTimer.Start(); bo = beta * omega; - auto p_v = p.View(); - auto r_v = r.View(); - auto v_v = v.View(); - accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); - }); + { + autoView( p_v , p, AcceleratorWrite); + autoView( r_v , r, AcceleratorRead); + autoView( v_v , v, AcceleratorRead); + accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); @@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction alpha = rho / Calpha.real(); LinearCombTimer.Start(); - auto h_v = h.View(); - auto psi_v = psi.View(); - accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); - }); - - auto s_v = s.View(); - accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); - }); + { + autoView( p_v , p, AcceleratorRead); + autoView( r_v , r, AcceleratorRead); + autoView( v_v , v, AcceleratorRead); + autoView( psi_v,psi, AcceleratorRead); + autoView( h_v , h, AcceleratorWrite); + autoView( s_v , s, AcceleratorWrite); + accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); + }); + accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); @@ -166,13 +172,19 @@ class BiCGSTAB : public OperatorFunction omega = Comega.real() / norm2(t); LinearCombTimer.Start(); - auto t_v = t.View(); - accelerator_for(ss, 
psi_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); - coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); - }); + { + autoView( psi_v,psi, AcceleratorWrite); + autoView( r_v , r, AcceleratorWrite); + autoView( h_v , h, AcceleratorRead); + autoView( s_v , s, AcceleratorRead); + autoView( t_v , t, AcceleratorRead); + accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); + coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); + }); + } LinearCombTimer.Stop(); - + cp = norm2(r); LinalgTimer.Stop(); diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 3a2544b5..14f3d306 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -140,13 +140,15 @@ public: b = cp / c; LinearCombTimer.Start(); - auto psi_v = psi.View(); - auto p_v = p.View(); - auto r_v = r.View(); - accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); - coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); - }); + { + autoView( psi_v , psi, AcceleratorWrite); + autoView( p_v , p, AcceleratorWrite); + autoView( r_v , r, AcceleratorWrite); + accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); + coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h new file mode 100644 index 00000000..22b7725e --- /dev/null +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -0,0 +1,241 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_PREC_GCR_NON_HERM_H +#define GRID_PREC_GCR_NON_HERM_H + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +//VPGCR Abe and Zhang, 2005. +//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING +//Computing and Information Volume 2, Number 2, Pages 147-161 +//NB. Likely not original reference since they are focussing on a preconditioner variant. 
+// but VPGCR was nicely written up in their paper +/////////////////////////////////////////////////////////////////////////////////////////////////////// +NAMESPACE_BEGIN(Grid); + +#define GCRLogLevel std::cout << GridLogMessage < +class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction { +public: + + RealD Tolerance; + Integer MaxIterations; + int verbose; + int mmax; + int nstep; + int steps; + int level; + GridStopWatch PrecTimer; + GridStopWatch MatTimer; + GridStopWatch LinalgTimer; + + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; + + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : + Tolerance(tol), + MaxIterations(maxit), + Linop(_Linop), + Preconditioner(Prec), + mmax(_mmax), + nstep(_nstep) + { + level=1; + verbose=1; + }; + + void operator() (const Field &src, Field &psi){ + + psi=Zero(); + RealD cp, ssq,rsq; + ssq=norm2(src); + rsq=Tolerance*Tolerance*ssq; + + Field r(src.Grid()); + + PrecTimer.Reset(); + MatTimer.Reset(); + LinalgTimer.Reset(); + + GridStopWatch SolverTimer; + SolverTimer.Start(); + + steps=0; + for(int k=0;k q(mmax,grid); + std::vector p(mmax,grid); + std::vector qq(mmax); + + GCRLogLevel<< "PGCR nStep("<(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. + for(int back=0;back=0); + + b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; + p[peri_kp]=p[peri_kp]+b*p[peri_back]; + q[peri_kp]=q[peri_kp]+b*q[peri_back]; + + } + qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm + LinalgTimer.Stop(); + } + assert(0); // never reached + return cp; + } +}; +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index ef6459ed..0d1707d9 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; -int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; -#ifdef GRID_CUDA -int PointerCache::Ncache = 32; -#else -int PointerCache::Ncache = 8; -#endif -int PointerCache::Victim; -int PointerCache::VictimSmall; -PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax]; -PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax]; - -void PointerCache::Init(void) -{ - char * str; - - str= getenv("GRID_ALLOC_NCACHE_LARGE"); - if ( str ) Ncache = atoi(str); - if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; - - str= getenv("GRID_ALLOC_NCACHE_SMALL"); - if ( str ) NcacheSmall = atoi(str); - if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; - - // printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax); -} -void *PointerCache::Insert(void *ptr,size_t bytes) -{ - if (bytes < GRID_ALLOC_SMALL_LIMIT ) - return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall); - return Insert(ptr,bytes,Entries,Ncache,Victim); -} -void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) -{ -#ifdef GRID_OMP - assert(omp_in_parallel()==0); -#endif - - void * ret = NULL; - int v = -1; - - for(int e=0;e See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#ifndef 
GRID_ALIGNED_ALLOCATOR_H -#define GRID_ALIGNED_ALLOCATOR_H - -#ifdef HAVE_MALLOC_MALLOC_H -#include -#endif -#ifdef HAVE_MALLOC_H -#include -#endif - -#ifdef HAVE_MM_MALLOC_H -#include -#endif - -#define POINTER_CACHE -#define GRID_ALLOC_ALIGN (2*1024*1024) -#define GRID_ALLOC_SMALL_LIMIT (4096) +#pragma once NAMESPACE_BEGIN(Grid); -// Move control to configure.ac and Config.h? - -class PointerCache { -private: -/*Pinning pages is costly*/ -/*Could maintain separate large and small allocation caches*/ -/* Could make these configurable, perhaps up to a max size*/ - static const int NcacheSmallMax=128; - static const int NcacheMax=16; - static int NcacheSmall; - static int Ncache; - - typedef struct { - void *address; - size_t bytes; - int valid; - } PointerCacheEntry; - - static PointerCacheEntry Entries[NcacheMax]; - static int Victim; - static PointerCacheEntry EntriesSmall[NcacheSmallMax]; - static int VictimSmall; - -public: - static void Init(void); - static void *Insert(void *ptr,size_t bytes) ; - static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ; - static void *Lookup(size_t bytes) ; - static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ; -}; - -std::string sizeString(size_t bytes); - -struct MemoryStats -{ - size_t totalAllocated{0}, maxAllocated{0}, - currentlyAllocated{0}, totalFreed{0}; -}; - -class MemoryProfiler -{ -public: - static MemoryStats *stats; - static bool debug; -}; - -#ifdef GRID_NVCC -#define profilerCudaMeminfo \ - { size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<totalAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ - << std::endl; \ - } \ - profilerCudaMeminfo; - -#define profilerAllocate(bytes) \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - s->totalAllocated += (bytes); \ - s->currentlyAllocated += (bytes); \ - s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ - } \ - if (MemoryProfiler::debug) \ - { \ - std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ - profilerDebugPrint; \ - } - -#define profilerFree(bytes) \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - s->totalFreed += (bytes); \ - s->currentlyAllocated -= (bytes); \ - } \ - if (MemoryProfiler::debug) \ - { \ - std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ - profilerDebugPrint; \ - } - -void check_huge_pages(void *Buf,uint64_t BYTES); - -//////////////////////////////////////////////////////////////////// -// A lattice of something, but assume the something is SIMDized. 
-//////////////////////////////////////////////////////////////////// - template class alignedAllocator { public: @@ -172,70 +53,60 @@ public: { size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); - - -#ifdef POINTER_CACHE - _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); -#else - pointer ptr = nullptr; -#endif - -#ifdef GRID_NVCC - //////////////////////////////////// - // Unified (managed) memory - //////////////////////////////////// - if ( ptr == (_Tp *) NULL ) { - // printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout); - auto err = cudaMallocManaged((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (_Tp *) NULL; - std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " < inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } +template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } -#ifdef GRID_NVCC - if ( __freeme ) cudaFree((void *)__freeme); -#else - #ifdef HAVE_MM_MALLOC_H - if ( __freeme ) _mm_free((void *)__freeme); - #else - if ( __freeme ) free((void *)__freeme); - #endif -#endif +template +class uvmAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef uvmAllocator<_Tp1> other; }; + uvmAllocator() throw() { } + uvmAllocator(const uvmAllocator&) throw() { } + template uvmAllocator(const uvmAllocator<_Tp1>&) throw() { } + ~uvmAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::SharedFree((void *)__p,bytes); } // FIXME: hack for the copy constructor, eventually it must be avoided @@ -244,17 +115,17 @@ public: void construct(pointer __p) { }; void destroy(pointer __p) { }; }; -template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } -template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } +template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = alignedAllocator; -template using Vector = std::vector >; -template using commVector = std::vector >; -template using Matrix = std::vector > >; +template using commAllocator = uvmAllocator; +template using Vector = std::vector >; +template using commVector = std::vector >; +//template using Matrix = std::vector > >; NAMESPACE_END(Grid); -#endif + diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h new file mode 100644 index 00000000..589ea36f --- /dev/null +++ b/Grid/allocator/Allocator.h @@ -0,0 +1,4 @@ +#pragma once +#include 
+#include +#include diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc new file mode 100644 index 00000000..e11ce948 --- /dev/null +++ b/Grid/allocator/MemoryManager.cc @@ -0,0 +1,244 @@ +#include + +NAMESPACE_BEGIN(Grid); + +/*Allocation types, saying which pointer cache should be used*/ +#define Cpu (0) +#define CpuSmall (1) +#define Acc (2) +#define AccSmall (3) +#define Shared (4) +#define SharedSmall (5) +uint64_t total_shared; +uint64_t total_device; +uint64_t total_host;; +void MemoryManager::PrintBytes(void) +{ + std::cout << " MemoryManager : "<=0) && (Nc < NallocCacheMax)) { + Ncache[Cpu]=Nc; + Ncache[Acc]=Nc; + Ncache[Shared]=Nc; + } + } + + str= getenv("GRID_ALLOC_NCACHE_SMALL"); + if ( str ) { + Nc = atoi(str); + if ( (Nc>=0) && (Nc < NallocCacheMax)) { + Ncache[CpuSmall]=Nc; + Ncache[AccSmall]=Nc; + Ncache[SharedSmall]=Nc; + } + } + std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<0); +#ifdef GRID_OMP + assert(omp_in_parallel()==0); +#endif + + void * ret = NULL; + int v = -1; + + for(int e=0;e0); +#ifdef GRID_OMP + assert(omp_in_parallel()==0); +#endif + for(int e=0;e +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once +#include +#include + +NAMESPACE_BEGIN(Grid); + +// Move control to configure.ac and Config.h? + +#define ALLOCATION_CACHE +#define GRID_ALLOC_ALIGN (2*1024*1024) +#define GRID_ALLOC_SMALL_LIMIT (4096) + +/*Pinning pages is costly*/ +//////////////////////////////////////////////////////////////////////////// +// Advise the LatticeAccelerator class +//////////////////////////////////////////////////////////////////////////// +enum ViewAdvise { + AdviseDefault = 0x0, // Regular data + AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. + + // AdviseTransient = 0x2, // Data will mostly be read. On some architectures + // enables read-only copies of memory to be kept on + // host and device. 
+ + // AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device + +}; + +//////////////////////////////////////////////////////////////////////////// +// View Access Mode +//////////////////////////////////////////////////////////////////////////// +enum ViewMode { + AcceleratorRead = 0x01, + AcceleratorWrite = 0x02, + AcceleratorWriteDiscard = 0x04, + CpuRead = 0x08, + CpuWrite = 0x10, + CpuWriteDiscard = 0x10 // same for now +}; + +class MemoryManager { +private: + + //////////////////////////////////////////////////////////// + // For caching recently freed allocations + //////////////////////////////////////////////////////////// + typedef struct { + void *address; + size_t bytes; + int valid; + } AllocationCacheEntry; + + static const int NallocCacheMax=128; + static const int NallocType=6; + static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; + static int Victim[NallocType]; + static int Ncache[NallocType]; + + ///////////////////////////////////////////////// + // Free pool + ///////////////////////////////////////////////// + static void *Insert(void *ptr,size_t bytes,int type) ; + static void *Lookup(size_t bytes,int type) ; + static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; + static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; + + static void *AcceleratorAllocate(size_t bytes); + static void AcceleratorFree (void *ptr,size_t bytes); + static void PrintBytes(void); + public: + static void Init(void); + static void *SharedAllocate(size_t bytes); + static void SharedFree (void *ptr,size_t bytes); + static void *CpuAllocate(size_t bytes); + static void CpuFree (void *ptr,size_t bytes); + + //////////////////////////////////////////////////////// + // Footprint tracking + //////////////////////////////////////////////////////// + static uint64_t DeviceBytes; + static uint64_t DeviceLRUBytes; + static uint64_t DeviceMaxBytes; + static uint64_t HostToDeviceBytes; + static uint64_t DeviceToHostBytes; + static uint64_t HostToDeviceXfer; + static uint64_t DeviceToHostXfer; + + private: +#ifndef GRID_UVM + ////////////////////////////////////////////////////////////////////// + // Data tables for ViewCache + ////////////////////////////////////////////////////////////////////// + typedef std::list LRU_t; + typedef typename LRU_t::iterator LRUiterator; + typedef struct { + int LRU_valid; + LRUiterator LRU_entry; + uint64_t CpuPtr; + uint64_t AccPtr; + size_t bytes; + uint32_t transient; + uint32_t state; + uint32_t accLock; + uint32_t cpuLock; + } AcceleratorViewEntry; + + typedef std::unordered_map AccViewTable_t; + typedef typename AccViewTable_t::iterator AccViewTableIterator ; + + static AccViewTable_t AccViewTable; + static LRU_t LRU; + + ///////////////////////////////////////////////// + // Device motion + ///////////////////////////////////////////////// + static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static void EvictVictims(uint64_t bytes); // Frees up + static void Evict(AcceleratorViewEntry &AccCache); + static void Flush(AcceleratorViewEntry &AccCache); + static void Clone(AcceleratorViewEntry &AccCache); + static void AccDiscard(AcceleratorViewEntry &AccCache); + static void CpuDiscard(AcceleratorViewEntry &AccCache); + + // static void LRUupdate(AcceleratorViewEntry &AccCache); + static void LRUinsert(AcceleratorViewEntry &AccCache); + static void LRUremove(AcceleratorViewEntry &AccCache); + + // manage entries in the table + 
static int EntryPresent(uint64_t CpuPtr); + static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static void EntryErase (uint64_t CpuPtr); + static AccViewTableIterator EntryLookup(uint64_t CpuPtr); + static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry); + + static void AcceleratorViewClose(uint64_t AccPtr); + static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static void CpuViewClose(uint64_t Ptr); + static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); +#endif + static void NotifyDeletion(void * CpuPtr); + + public: + static void Print(void); + static int isOpen (void* CpuPtr); + static void ViewClose(void* CpuPtr,ViewMode mode); + static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + +}; + +NAMESPACE_END(Grid); + + diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc new file mode 100644 index 00000000..5dd7575e --- /dev/null +++ b/Grid/allocator/MemoryManagerCache.cc @@ -0,0 +1,468 @@ +#include + +#ifndef GRID_UVM + +#warning "Using explicit device memory copies" +NAMESPACE_BEGIN(Grid); +#define dprintf(...) + +//////////////////////////////////////////////////////////// +// For caching copies of data on device +//////////////////////////////////////////////////////////// +MemoryManager::AccViewTable_t MemoryManager::AccViewTable; +MemoryManager::LRU_t MemoryManager::LRU; + +//////////////////////////////////////////////////////// +// Footprint tracking +//////////////////////////////////////////////////////// +uint64_t MemoryManager::DeviceBytes; +uint64_t MemoryManager::DeviceLRUBytes; +uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128; +uint64_t MemoryManager::HostToDeviceBytes; +uint64_t MemoryManager::DeviceToHostBytes; +uint64_t MemoryManager::HostToDeviceXfer; +uint64_t MemoryManager::DeviceToHostXfer; + +//////////////////////////////////// +// Priority ordering for unlocked entries +// Empty +// CpuDirty +// Consistent +// AccDirty +//////////////////////////////////// +#define Empty (0x0) /*Entry unoccupied */ +#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/ +#define Consistent (0x2) /*ACC copy AND CPU copy are valid */ +#define AccDirty (0x4) /*ACC copy is golden */ +#define EvictNext (0x8) /*Priority for eviction*/ + +///////////////////////////////////////////////// +// Mechanics of data table maintenance +///////////////////////////////////////////////// +int MemoryManager::EntryPresent(uint64_t CpuPtr) +{ + if(AccViewTable.empty()) return 0; + + auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1)); + return count; +} +void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + assert(!EntryPresent(CpuPtr)); + AcceleratorViewEntry AccCache; + AccCache.CpuPtr = CpuPtr; + AccCache.AccPtr = (uint64_t)NULL; + AccCache.bytes = bytes; + AccCache.state = CpuDirty; + AccCache.LRU_valid=0; + AccCache.transient=0; + AccCache.accLock=0; + AccCache.cpuLock=0; + AccViewTable[CpuPtr] = AccCache; +} +MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr) +{ + assert(EntryPresent(CpuPtr)); + auto AccCacheIterator = AccViewTable.find(CpuPtr); + assert(AccCacheIterator!=AccViewTable.end()); + return AccCacheIterator; +} +void MemoryManager::EntryErase(uint64_t CpuPtr) +{ + auto AccCache = EntryLookup(CpuPtr); + AccViewTable.erase(CpuPtr); +} +void 
MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.LRU_valid==0); + if (AccCache.transient) { + LRU.push_back(AccCache.CpuPtr); + AccCache.LRU_entry = --LRU.end(); + } else { + LRU.push_front(AccCache.CpuPtr); + AccCache.LRU_entry = LRU.begin(); + } + AccCache.LRU_valid = 1; + DeviceLRUBytes+=AccCache.bytes; +} +void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.LRU_valid==1); + LRU.erase(AccCache.LRU_entry); + AccCache.LRU_valid = 0; + DeviceLRUBytes-=AccCache.bytes; +} +///////////////////////////////////////////////// +// Accelerator cache motion & consistency logic +///////////////////////////////////////////////// +void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) +{ + /////////////////////////////////////////////////////////// + // Remove from Accelerator, remove entry, without flush + // Cannot be locked. If allocated Must be in LRU pool. + /////////////////////////////////////////////////////////// + assert(AccCache.state!=Empty); + + // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + assert(AccCache.accLock==0); + assert(AccCache.cpuLock==0); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr) { + AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceBytes -=AccCache.bytes; + LRUremove(AccCache); + // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + } + uint64_t CpuPtr = AccCache.CpuPtr; + EntryErase(CpuPtr); +} + +void MemoryManager::Evict(AcceleratorViewEntry &AccCache) +{ + /////////////////////////////////////////////////////////////////////////// + // Make CPU consistent, remove from Accelerator, remove entry + // Cannot be locked. If allocated must be in LRU pool. 
+ /////////////////////////////////////////////////////////////////////////// + assert(AccCache.state!=Empty); + + // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + assert(AccCache.accLock==0); + assert(AccCache.cpuLock==0); + if(AccCache.state==AccDirty) { + Flush(AccCache); + } + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr) { + AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceBytes -=AccCache.bytes; + LRUremove(AccCache); + // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + } + uint64_t CpuPtr = AccCache.CpuPtr; + EntryErase(CpuPtr); +} +void MemoryManager::Flush(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.state==AccDirty); + assert(AccCache.cpuLock==0); + assert(AccCache.accLock==0); + assert(AccCache.AccPtr!=(uint64_t)NULL); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); + // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + DeviceToHostBytes+=AccCache.bytes; + DeviceToHostXfer++; + AccCache.state=Consistent; +} +void MemoryManager::Clone(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.state==CpuDirty); + assert(AccCache.cpuLock==0); + assert(AccCache.accLock==0); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr==(uint64_t)NULL){ + AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); + DeviceBytes+=AccCache.bytes; + } + // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); + HostToDeviceBytes+=AccCache.bytes; + HostToDeviceXfer++; + AccCache.state=Consistent; +} + +void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.state!=Empty); + assert(AccCache.cpuLock==0); + assert(AccCache.accLock==0); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr==(uint64_t)NULL){ + AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); + DeviceBytes+=AccCache.bytes; + } + AccCache.state=AccDirty; +} + +///////////////////////////////////////////////////////////////////////////////// +// View management +///////////////////////////////////////////////////////////////////////////////// +void MemoryManager::ViewClose(void* Ptr,ViewMode mode) +{ + if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + AcceleratorViewClose((uint64_t)Ptr); + } else if( (mode==CpuRead)||(mode==CpuWrite)){ + CpuViewClose((uint64_t)Ptr); + } else { + assert(0); + } +} +void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + uint64_t CpuPtr = (uint64_t)_CpuPtr; + if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); + } else if( (mode==CpuRead)||(mode==CpuWrite)){ + return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); + } else { + assert(0); + return NULL; + } +} +void MemoryManager::EvictVictims(uint64_t bytes) +{ + while(bytes+DeviceLRUBytes > DeviceMaxBytes){ + if ( DeviceLRUBytes > 0){ + assert(LRU.size()>0); + uint64_t victim = LRU.back(); + auto AccCacheIterator = EntryLookup(victim); + auto & AccCache = AccCacheIterator->second; + Evict(AccCache); + } + } +} +uint64_t MemoryManager::AcceleratorViewOpen(uint64_t 
CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + //////////////////////////////////////////////////////////////////////////// + // Find if present, otherwise get or force an empty + //////////////////////////////////////////////////////////////////////////// + if ( EntryPresent(CpuPtr)==0 ){ + EvictVictims(bytes); + EntryCreate(CpuPtr,bytes,mode,hint); + } + + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); + + assert(AccCache.cpuLock==0); // Programming error + + if(AccCache.state!=Empty) { + assert(AccCache.CpuPtr == CpuPtr); + assert(AccCache.bytes ==bytes); + } +/* + * State transitions and actions + * + * Action State StateNext Flush Clone + * + * AccRead Empty Consistent - Y + * AccWrite Empty AccDirty - Y + * AccRead CpuDirty Consistent - Y + * AccWrite CpuDirty AccDirty - Y + * AccRead Consistent Consistent - - + * AccWrite Consistent AccDirty - - + * AccRead AccDirty AccDirty - - + * AccWrite AccDirty AccDirty - - + */ + if(AccCache.state==Empty) { + assert(AccCache.LRU_valid==0); + AccCache.CpuPtr = CpuPtr; + AccCache.AccPtr = (uint64_t)NULL; + AccCache.bytes = bytes; + AccCache.state = CpuDirty; // Cpu starts primary + if(mode==AcceleratorWriteDiscard){ + CpuDiscard(AccCache); + AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty + } else if(mode==AcceleratorWrite){ + Clone(AccCache); + AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty + } else { + Clone(AccCache); + AccCache.state = Consistent; // Empty + AccRead => Consistent + } + AccCache.accLock= 1; + } else if(AccCache.state==CpuDirty ){ + if(mode==AcceleratorWriteDiscard) { + CpuDiscard(AccCache); + AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty + } else if(mode==AcceleratorWrite) { + Clone(AccCache); + AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty + } else { + Clone(AccCache); + AccCache.state = Consistent; // CpuDirty + AccRead => Consistent + } + AccCache.accLock++; + // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + } else if(AccCache.state==Consistent) { + if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) + AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty + else + AccCache.state = Consistent; // Consistent + AccRead => Consistent + AccCache.accLock++; + // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + } else if(AccCache.state==AccDirty) { + if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) + AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty + else + AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty + AccCache.accLock++; + // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + } else { + assert(0); + } + + // If view is opened on device remove from LRU + if(AccCache.LRU_valid==1){ + // must possibly remove from LRU as now locked on GPU + LRUremove(AccCache); + } + + int transient =hint; + AccCache.transient= transient? 
EvictNext : 0; + + return AccCache.AccPtr; +} +//////////////////////////////////// +// look up & decrement lock count +//////////////////////////////////// +void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) +{ + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert(AccCache.cpuLock==0); + assert(AccCache.accLock>0); + + AccCache.accLock--; + + // Move to LRU queue if not locked and close on device + if(AccCache.accLock==0) { + LRUinsert(AccCache); + } +} +void MemoryManager::CpuViewClose(uint64_t CpuPtr) +{ + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert(AccCache.cpuLock>0); + assert(AccCache.accLock==0); + + AccCache.cpuLock--; +} +/* + * Action State StateNext Flush Clone + * + * CpuRead Empty CpuDirty - - + * CpuWrite Empty CpuDirty - - + * CpuRead CpuDirty CpuDirty - - + * CpuWrite CpuDirty CpuDirty - - + * CpuRead Consistent Consistent - - + * CpuWrite Consistent CpuDirty - - + * CpuRead AccDirty Consistent Y - + * CpuWrite AccDirty CpuDirty Y - + */ +uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient) +{ + //////////////////////////////////////////////////////////////////////////// + // Find if present, otherwise get or force an empty + //////////////////////////////////////////////////////////////////////////// + if ( EntryPresent(CpuPtr)==0 ){ + EvictVictims(bytes); + EntryCreate(CpuPtr,bytes,mode,transient); + } + + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert((mode==CpuRead)||(mode==CpuWrite)); + assert(AccCache.accLock==0); // Programming error + + if(AccCache.state!=Empty) { + assert(AccCache.CpuPtr == CpuPtr); + assert(AccCache.bytes==bytes); + } + + if(AccCache.state==Empty) { + AccCache.CpuPtr = CpuPtr; + AccCache.AccPtr = (uint64_t)NULL; + AccCache.bytes = bytes; + AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty + AccCache.accLock= 0; + AccCache.cpuLock= 1; + } else if(AccCache.state==CpuDirty ){ + // AccPtr dont care, deferred allocate + AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty + AccCache.cpuLock++; + } else if(AccCache.state==Consistent) { + assert(AccCache.AccPtr != (uint64_t)NULL); + if(mode==CpuWrite) + AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty + else + AccCache.state = Consistent; // Consistent +CpuRead => Consistent + AccCache.cpuLock++; + } else if(AccCache.state==AccDirty) { + assert(AccCache.AccPtr != (uint64_t)NULL); + Flush(AccCache); + if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush + else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush + AccCache.cpuLock++; + } else { + assert(0); // should be unreachable + } + + AccCache.transient= transient? 
EvictNext : 0; + + return AccCache.CpuPtr; +} +void MemoryManager::NotifyDeletion(void *_ptr) +{ + // Look up in ViewCache + uint64_t ptr = (uint64_t)_ptr; + if(EntryPresent(ptr)) { + auto e = EntryLookup(ptr); + AccDiscard(e->second); + } +} +void MemoryManager::Print(void) +{ + std::cout << GridLogDebug << "--------------------------------------------" << std::endl; + std::cout << GridLogDebug << "Memory Manager " << std::endl; + std::cout << GridLogDebug << "--------------------------------------------" << std::endl; + std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; + std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; + std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; + std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; + std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; + std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; + std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; + std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; + std::cout << GridLogDebug << "--------------------------------------------" << std::endl; + std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + std::cout << GridLogDebug << "0x"<second; + return AccCache.cpuLock+AccCache.accLock; + } else { + return 0; + } +} + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc new file mode 100644 index 00000000..537f7c32 --- /dev/null +++ b/Grid/allocator/MemoryManagerShared.cc @@ -0,0 +1,24 @@ +#include +#ifdef GRID_UVM + +#warning "Grid is assuming unified virtual memory address space" +NAMESPACE_BEGIN(Grid); +///////////////////////////////////////////////////////////////////////////////// +// View management is 1:1 address space mapping +///////////////////////////////////////////////////////////////////////////////// +uint64_t MemoryManager::DeviceBytes; +uint64_t MemoryManager::DeviceLRUBytes; +uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128; +uint64_t MemoryManager::HostToDeviceBytes; +uint64_t MemoryManager::DeviceToHostBytes; +uint64_t MemoryManager::HostToDeviceXfer; +uint64_t MemoryManager::DeviceToHostXfer; + +void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; +void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; +int MemoryManager::isOpen (void* CpuPtr) { return 0;} +void MemoryManager::Print(void){}; +void MemoryManager::NotifyDeletion(void *ptr){}; + +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc new file mode 100644 index 00000000..0d1707d9 --- /dev/null +++ b/Grid/allocator/MemoryStats.cc @@ -0,0 +1,67 @@ +#include +#include + +NAMESPACE_BEGIN(Grid); + +MemoryStats *MemoryProfiler::stats = nullptr; +bool MemoryProfiler::debug = false; + +void check_huge_pages(void *Buf,uint64_t BYTES) +{ +#ifdef __linux__ + int fd = open("/proc/self/pagemap", O_RDONLY); + assert(fd >= 
0); + const int page_size = 4096; + uint64_t virt_pfn = (uint64_t)Buf / page_size; + off_t offset = sizeof(uint64_t) * virt_pfn; + uint64_t npages = (BYTES + page_size-1) / page_size; + uint64_t pagedata[npages]; + uint64_t ret = lseek(fd, offset, SEEK_SET); + assert(ret == offset); + ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + assert(ret == sizeof(uint64_t) * npages); + int nhugepages = npages / 512; + int n4ktotal, nnothuge; + n4ktotal = 0; + nnothuge = 0; + for (int i = 0; i < nhugepages; ++i) { + uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; + for (int j = 0; j < 512; ++j) { + uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; + ++n4ktotal; + if (pageaddr != baseaddr + j * page_size) + ++nnothuge; + } + } + int rank = CartesianCommunicator::RankWorld(); + printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); +#endif +} + +std::string sizeString(const size_t bytes) +{ + constexpr unsigned int bufSize = 256; + const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; + char buf[256]; + size_t s = 0; + double count = bytes; + + while (count >= 1024 && s < 7) + { + s++; + count /= 1024; + } + if (count - floor(count) == 0.0) + { + snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); + } + else + { + snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); + } + + return std::string(buf); +} + +NAMESPACE_END(Grid); + diff --git a/Grid/allocator/MemoryStats.h b/Grid/allocator/MemoryStats.h new file mode 100644 index 00000000..156c9747 --- /dev/null +++ b/Grid/allocator/MemoryStats.h @@ -0,0 +1,95 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/MemoryStats.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + + +NAMESPACE_BEGIN(Grid); + +std::string sizeString(size_t bytes); + +struct MemoryStats +{ + size_t totalAllocated{0}, maxAllocated{0}, + currentlyAllocated{0}, totalFreed{0}; +}; + +class MemoryProfiler +{ +public: + static MemoryStats *stats; + static bool debug; +}; + +#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" +#define profilerDebugPrint \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ + << std::endl; \ + } + +#define profilerAllocate(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalAllocated += (bytes); \ + s->currentlyAllocated += (bytes); \ + s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } + +#define profilerFree(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalFreed += (bytes); \ + s->currentlyAllocated -= (bytes); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } + +void check_huge_pages(void *Buf,uint64_t BYTES); + +NAMESPACE_END(Grid); + diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h index 87472cc9..ae1fd1fd 100644 --- a/Grid/cartesian/Cartesian_base.h +++ b/Grid/cartesian/Cartesian_base.h @@ -81,6 +81,7 @@ public: bool _isCheckerBoarded; int LocallyPeriodic; + Coordinate _checker_dim_mask; public: diff --git a/Grid/cartesian/Cartesian_full.h b/Grid/cartesian/Cartesian_full.h index c083817b..31a67bf0 100644 --- a/Grid/cartesian/Cartesian_full.h +++ b/Grid/cartesian/Cartesian_full.h @@ -38,6 +38,7 @@ class GridCartesian: public GridBase { public: int dummy; + Coordinate _checker_dim_mask; virtual int CheckerBoardFromOindexTable (int Oindex) { return 0; } @@ -104,6 +105,7 @@ public: _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); _simd_layout.resize(_ndimension); + _checker_dim_mask.resize(_ndimension);; _lstart.resize(_ndimension); _lend.resize(_ndimension); @@ -114,6 +116,8 @@ public: for (int d = 0; d < _ndimension; d++) { + _checker_dim_mask[d]=0; + _fdimensions[d] = dimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions _simd_layout[d] = simd_layout[d]; diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index 34f763d2..b71981f5 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -35,12 +35,28 @@ static const int CbRed =0; static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; + +accelerator_inline int 
RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +{ + int nd=rdim.size(); + Coordinate coor(nd); + + Lexicographic::CoorFromIndex(coor,oindex,rdim); + + int linear=0; + for(int d=0;d _checker_board; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index ed465252..45fefc71 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -29,7 +29,7 @@ Author: Peter Boyle #include #include -#ifdef GRID_NVCC +#ifdef GRID_CUDA #include #endif @@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD std::vector primes({2,3,5}); int dim = 0; + int last_dim = ndimension - 1; int AutoShmSize = 1; while(AutoShmSize != WorldShmSize) { - for(int p=0;p NAMESPACE_BEGIN(Grid); +extern Vector > Cshift_table; + /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// @@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int e2=rhs.Grid()->_slice_block[dimension]; int ent = 0; - static Vector > table; table.resize(e1*e2); + if(Cshift_table.size()_slice_stride[dimension]; - auto rhs_v = rhs.View(); if ( cbmask == 0x3 ) { for(int n=0;n(off+bo+b,so+o+b); + Cshift_table[ent++] = std::pair(off+bo+b,so+o+b); } } } else { @@ -65,14 +67,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int o = n*stride; int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb &cbmask ) { - table[ent++]=std::pair (off+bo++,so+o+b); + Cshift_table[ent++]=std::pair (off+bo++,so+o+b); } } } } - thread_for(i,ent,{ - buffer[table[i].first]=rhs_v[table[i].second]; - }); + { + autoView(rhs_v , rhs, AcceleratorRead); + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); + } } /////////////////////////////////////////////////////////////////// @@ -95,36 +102,38 @@ Gather_plane_extract(const Lattice &rhs, int e2=rhs.Grid()->_slice_block[dimension]; int n1=rhs.Grid()->_slice_stride[dimension]; - auto rhs_v = rhs.View(); if ( cbmask ==0x3){ - thread_for_collapse(2,n,e1,{ - for(int b=0;b(temp,pointers,offset); - } - }); + }); } else { + autoView(rhs_v , rhs, AcceleratorRead); - // Case of SIMD split AND checker dim cannot currently be hit, except in - // Test_cshift_red_black code. 
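// The parity rule RedBlackCheckerBoardFromOindex encodes, reduced to a
// standalone helper: decode the outer index into a coordinate (done above with
// Lexicographic::CoorFromIndex), sum the components of the checkerboarded
// directions selected by the mask, and take the result mod 2.
#include <vector>
inline int checker_parity_sketch(const std::vector<int> &coor,
                                 const std::vector<int> &chk_dim_msk)
{
  int linear = 0;
  for (size_t d = 0; d < coor.size(); ++d)
    if (chk_dim_msk[d]) linear += coor[d];
  return linear & 0x1;    // 0 = even (red), 1 = odd (black)
}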
- std::cout << " Dense packed buffer WARNING " <_rdimensions; + Coordinate cdm =rhs.Grid()->_checker_dim_mask; + std::cout << " Dense packed buffer WARNING " <CheckerBoardFromOindex(o+b); + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); } - } - }); + }); } } @@ -145,7 +154,8 @@ template void Scatter_plane_simple (Lattice &rhs,commVector_slice_block[dimension]; int stride=rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + if(Cshift_table.size() void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension]; - table[ent++] = std::pair(so+o+b,bo+b); + Cshift_table[ent++] = std::pair(so+o+b,bo+b); } } @@ -165,16 +175,20 @@ template void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { - table[ent++]=std::pair (so+o+b,bo++); + Cshift_table[ent++]=std::pair (so+o+b,bo++); } } } } - auto rhs_v = rhs.View(); - thread_for(i,ent,{ - rhs_v[table[i].first]=buffer[table[i].second]; - }); + { + autoView( rhs_v, rhs, AcceleratorWrite); + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + rhs_v[table[i].first]=buffer_p[table[i].second]; + }); + } } ////////////////////////////////////////////////////// @@ -194,21 +208,19 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - auto rhs_v = rhs.View(); - thread_for_collapse(2,n,e1,{ - for(int b=0;b_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; merge(rhs_v[so+o+b],pointers,offset); - } - }); + }); } else { // Case of SIMD split AND checker dim cannot currently be hit, except in // Test_cshift_red_black code. 
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<_slice_stride[dimension]; @@ -225,6 +237,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// + template void Copy_plane(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -239,14 +252,16 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e2=rhs.Grid()->_slice_block[dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + + if(Cshift_table.size()(lo+o,ro+o); + Cshift_table[ent++] = std::pair(lo+o,ro+o); } } } else { @@ -255,23 +270,24 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int o =n*stride+b; int ocb=1<CheckerBoardFromOindex(o); if ( ocb&cbmask ) { - table[ent++] = std::pair(lo+o,ro+o); + Cshift_table[ent++] = std::pair(lo+o,ro+o); } } } } - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for(i,ent,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; - }); - + { + autoView(rhs_v , rhs, AcceleratorRead); + autoView(lhs_v , lhs, AcceleratorWrite); + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); + } } template void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) { - int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -285,29 +301,33 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_block [dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + if(Cshift_table.size()(lo+o+b,ro+o+b); + Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); }} } else { for(int n=0;nCheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) table[ent++] = std::pair(lo+o+b,ro+o+b); + if ( ocb&cbmask ) Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); }} } - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for(i,ent,{ - permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); - }); + { + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); + }); + } } ////////////////////////////////////////////////////// diff --git a/Grid/cshift/Cshift_table.cc b/Grid/cshift/Cshift_table.cc new file mode 100644 index 00000000..d46e51c0 --- /dev/null +++ b/Grid/cshift/Cshift_table.cc @@ -0,0 +1,4 @@ +#include +NAMESPACE_BEGIN(Grid); +Vector > Cshift_table; +NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 036633b4..a3017198 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -26,6 +26,7 @@ Author: Peter Boyle *************************************************************************************/ /* END LEGAL */ #pragma once +#include #include #include #include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index da63d5e6..91b456d9 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -92,12 +92,18 @@ const lobj & eval(const 
uint64_t ss, const LatticeView &arg) { return arg[ss]; } + +// What needs this? +// Cannot be legal on accelerator +// Comparison must convert +#if 1 template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) { - auto view = arg.AcceleratorView(ViewRead); + auto view = arg.View(AcceleratorRead); return view[ss]; } +#endif /////////////////////////////////////////////////// // handle nodes in syntax tree- eval one operand @@ -180,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf cb = lat.Checkerboard(); } template ::value, T1>::type * = nullptr> -inline void CBFromExpression(int &cb, const T1 ¬lat) // non-lattice leaf -{ -} - +inline void CBFromExpression(int &cb, const T1 ¬lat) {} // non-lattice leaf template inline void CBFromExpression(int &cb,const LatticeUnaryExpression &expr) { CBFromExpression(cb, expr.arg1); // recurse AST } - template inline void CBFromExpression(int &cb,const LatticeBinaryExpression &expr) { @@ -204,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression::value, T1>::type * = nullptr> +inline void ExpressionViewOpen(T1 &lat) // Lattice leaf +{ + lat.ViewOpen(AcceleratorRead); +} +template ::value, T1>::type * = nullptr> + inline void ExpressionViewOpen(T1 ¬lat) {} + +template inline +void ExpressionViewOpen(LatticeUnaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST +} + +template inline +void ExpressionViewOpen(LatticeBinaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST + ExpressionViewOpen(expr.arg2); // recurse AST +} +template +inline void ExpressionViewOpen(LatticeTrinaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST + ExpressionViewOpen(expr.arg2); // recurse AST + ExpressionViewOpen(expr.arg3); // recurse AST +} + +////////////////////////////////////////////////////////////////////////// +// ViewClose +////////////////////////////////////////////////////////////////////////// +template ::value, T1>::type * = nullptr> +inline void ExpressionViewClose( T1 &lat) // Lattice leaf +{ + lat.ViewClose(); +} +template ::value, T1>::type * = nullptr> +inline void ExpressionViewClose(T1 ¬lat) {} + +template inline +void ExpressionViewClose(LatticeUnaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST +} +template inline +void ExpressionViewClose(LatticeBinaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST + ExpressionViewClose(expr.arg2); // recurse AST +} +template +inline void ExpressionViewClose(LatticeTrinaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST + ExpressionViewClose(expr.arg2); // recurse AST + ExpressionViewClose(expr.arg3); // recurse AST +} + //////////////////////////////////////////// // Unary operators and funcs //////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index c4a67620..a3ae1f28 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); conformable(ret,rhs); conformable(lhs,rhs); 
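// A standalone analogue of the ExpressionViewOpen/ExpressionViewClose
// recursion defined above: walk an expression tree, apply an action to every
// "lattice" leaf, and leave non-lattice (scalar) leaves alone. All types and
// names below are illustrative, not Grid's.
#include <iostream>
struct Leaf {
  const char *name;
  void open()  { std::cout << "open "  << name << "\n"; }
  void close() { std::cout << "close " << name << "\n"; }
};
template <class L, class R> struct Binary { L arg1; R arg2; };

inline void viewOpen (Leaf &l)  { l.open();  }
inline void viewClose(Leaf &l)  { l.close(); }
inline void viewOpen (double &) {}            // non-lattice leaf: no-op
inline void viewClose(double &) {}
template <class L, class R> void viewOpen (Binary<L, R> &e) { viewOpen (e.arg1); viewOpen (e.arg2); }
template <class L, class R> void viewClose(Binary<L, R> &e) { viewClose(e.arg1); viewClose(e.arg2); }

int main()
{
  Binary<Leaf, Binary<double, Leaf>> expr{{"x"}, {2.0, {"y"}}};
  viewOpen(expr);    // opens x and y, skips the scalar 2.0
  viewClose(expr);   // closes them again, mirroring ExpressionViewClose
}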
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -56,9 +56,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -73,9 +73,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -89,9 +89,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -108,8 +108,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -121,8 +121,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -135,8 +135,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -148,8 +148,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -165,8 +165,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ 
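// The shape every routine in this file now takes, collected in one place as a
// sketch (Grid's autoView, accelerator_for and coalesced accessors assumed):
// fix the checkerboard, check conformability, open views with an explicit
// access mode, run the site loop; the views close when they go out of scope.
template <class obj1, class obj2, class obj3>
inline void add_sketch(Grid::Lattice<obj1> &ret,
                       const Grid::Lattice<obj2> &lhs,
                       const Grid::Lattice<obj3> &rhs)
{
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret, rhs);
  conformable(lhs, rhs);
  autoView( ret_v, ret, AcceleratorWrite);   // views open here ...
  autoView( lhs_v, lhs, AcceleratorRead);
  autoView( rhs_v, rhs, AcceleratorRead);
  accelerator_for(ss, lhs_v.size(), obj1::Nsimd(), {
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t = lhs_v(ss);
    auto rhs_t = rhs_v(ss);
    add(&tmp, &lhs_t, &rhs_t);
    coalescedWrite(ret_v[ss], tmp);
  });
}                                            // ... and close at end of scope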
ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -179,8 +179,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -193,8 +193,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -206,8 +206,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -221,9 +221,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto x_v = x.AcceleratorView(ViewRead); - auto y_v = y.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( x_v , x, AcceleratorRead); + autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -234,9 +234,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto x_v = x.AcceleratorView(ViewRead); - auto y_v = y.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( x_v , x, AcceleratorRead); + autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 74525cc1..73b1b6a1 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + #pragma once #define STREAMING_STORES @@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid); extern int GridCshiftPermuteMap[4][16]; -/////////////////////////////////////////////////////////////////// -// Base class which can be used by traits to pick up behaviour -/////////////////////////////////////////////////////////////////// -class LatticeBase {}; - -///////////////////////////////////////////////////////////////////////////////////////// -// 
Conformable checks; same instance of Grid required -///////////////////////////////////////////////////////////////////////////////////////// -void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) -{ - assert(lhs == rhs); -} - -//////////////////////////////////////////////////////////////////////////// -// Advise the LatticeAccelerator class -//////////////////////////////////////////////////////////////////////////// -enum LatticeAcceleratorAdvise { - AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can - // significantly influence performance of bulk storage. - AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures - // enables read-only copies of memory to be kept on - // host and device. -}; - -//////////////////////////////////////////////////////////////////////////// -// View Access Mode -//////////////////////////////////////////////////////////////////////////// -enum ViewMode { - ViewRead = 0x1, - ViewWrite = 0x2, - ViewReadWrite = 0x3 -}; - -//////////////////////////////////////////////////////////////////////////// -// Minimal base class containing only data valid to access from accelerator -// _odata will be a managed pointer in CUDA -//////////////////////////////////////////////////////////////////////////// -// Force access to lattice through a view object. -// prevents writing of code that will not offload to GPU, but perhaps annoyingly -// strict since host could could in principle direct access through the lattice object -// Need to decide programming model. -#define LATTICE_VIEW_STRICT -template class LatticeAccelerator : public LatticeBase -{ -protected: - GridBase *_grid; - int checkerboard; - vobj *_odata; // A managed pointer - uint64_t _odata_size; -public: - accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { }; - accelerator_inline uint64_t oSites(void) const { return _odata_size; }; - accelerator_inline int Checkerboard(void) const { return checkerboard; }; - accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view - accelerator_inline void Conformable(GridBase * &grid) const - { - if (grid) conformable(grid, _grid); - else grid = _grid; - }; - - accelerator_inline void Advise(int advise) { -#ifdef GRID_NVCC -#ifndef __CUDA_ARCH__ // only on host - if (advise & AdviseInfrequentUse) { - cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); - } - if (advise & AdviseReadMostly) { - cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1); - } -#endif -#endif - }; - - accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future -#ifdef GRID_NVCC -#ifndef __CUDA_ARCH__ // only on host - int target; - cudaGetDevice(&target); - cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); -#endif -#endif - }; - - accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future -#ifdef GRID_NVCC -#ifndef __CUDA_ARCH__ // only on host - cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); -#endif -#endif - }; -}; - -///////////////////////////////////////////////////////////////////////////////////////// -// A View class which provides accessor to the data. 
-// This will be safe to call from accelerator_for and is trivially copy constructible -// The copy constructor for this will need to be used by device lambda functions -///////////////////////////////////////////////////////////////////////////////////////// -template -class LatticeView : public LatticeAccelerator -{ -public: - - - // Rvalue -#ifdef __CUDA_ARCH__ - accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); } -#else - accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } -#endif - - accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; - accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; - - accelerator_inline uint64_t begin(void) const { return 0;}; - accelerator_inline uint64_t end(void) const { return this->_odata_size; }; - accelerator_inline uint64_t size(void) const { return this->_odata_size; }; - - LatticeView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me) - { - } -}; - -///////////////////////////////////////////////////////////////////////////////////////// -// Lattice expression types used by ET to assemble the AST -// -// Need to be able to detect code paths according to the whether a lattice object or not -// so introduce some trait type things -///////////////////////////////////////////////////////////////////////////////////////// - -class LatticeExpressionBase {}; - -template using is_lattice = std::is_base_of; -template using is_lattice_expr = std::is_base_of; - -template struct ViewMapBase { typedef T Type; }; -template struct ViewMapBase { typedef LatticeView Type; }; -template using ViewMap = ViewMapBase::value >; - -template -class LatticeUnaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - Op op; - T1 arg1; - LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {}; -}; - -template -class LatticeBinaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - typedef typename ViewMap<_T2>::Type T2; - Op op; - T1 arg1; - T2 arg2; - LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {}; -}; - -template -class LatticeTrinaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - typedef typename ViewMap<_T2>::Type T2; - typedef typename ViewMap<_T3>::Type T3; - Op op; - T1 arg1; - T2 arg2; - T3 arg3; - LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {}; -}; - ///////////////////////////////////////////////////////////////////////////////////////// // The real lattice class, with normal copy and assignment semantics. 
// This contains extra (host resident) grid pointer data that may be accessed by host code @@ -246,38 +73,33 @@ private: dealloc(); this->_odata_size = size; - if ( size ) + if ( size ) this->_odata = alloc.allocate(this->_odata_size); else this->_odata = nullptr; } } public: + + ///////////////////////////////////////////////////////////////////////////////// + // Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents + ///////////////////////////////////////////////////////////////////////////////// + void SetViewMode(ViewMode mode) { + LatticeView accessor(*( (LatticeAccelerator *) this),mode); + accessor.ViewClose(); + } ///////////////////////////////////////////////////////////////////////////////// // Return a view object that may be dereferenced in site loops. // The view is trivially copy constructible and may be copied to an accelerator device // in device lambdas ///////////////////////////////////////////////////////////////////////////////// - LatticeView View (void) const // deprecated, should pick AcceleratorView for accelerator_for - { // and HostView for thread_for - LatticeView accessor(*( (LatticeAccelerator *) this)); + + LatticeView View (ViewMode mode) const + { + LatticeView accessor(*( (LatticeAccelerator *) this),mode); return accessor; } - LatticeView AcceleratorView(int mode = ViewReadWrite) const - { - LatticeView accessor(*( (LatticeAccelerator *) this)); - accessor.AcceleratorPrefetch(mode); - return accessor; - } - - LatticeView HostView(int mode = ViewReadWrite) const - { - LatticeView accessor(*( (LatticeAccelerator *) this)); - accessor.HostPrefetch(mode); - return accessor; - } - ~Lattice() { if ( this->_odata_size ) { dealloc(); @@ -297,12 +119,16 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - - auto me = AcceleratorView(ViewWrite); + + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); + ExpressionViewClose(exprCopy); return *this; } template inline Lattice & operator=(const LatticeBinaryExpression &expr) @@ -317,11 +143,15 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = AcceleratorView(ViewWrite); + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); + ExpressionViewClose(exprCopy); return *this; } template inline Lattice & operator=(const LatticeTrinaryExpression &expr) @@ -335,11 +165,15 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = AcceleratorView(ViewWrite); + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); + ExpressionViewClose(exprCopy); return *this; } //GridFromExpression is tricky to do @@ -390,10 +224,11 @@ public: } template inline Lattice & operator = (const sobj & r){ - auto me = View(); + auto me = View(CpuWrite); thread_for(ss,me.size(),{ - me[ss] = r; + me[ss]= r; }); + me.ViewClose(); return *this; } @@ -403,11 +238,12 @@ public: /////////////////////////////////////////// // user defined constructor 
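// User-level consequence of the operator=(expression) changes above: the
// expression AST is copied, every lattice leaf is opened with AcceleratorRead,
// the target with AcceleratorWriteDiscard, and all views are closed again
// before operator= returns. A minimal usage sketch, assuming `grid` points at
// an already-constructed GridCartesian:
Grid::LatticeComplex x(grid), y(grid), z(grid);
z = x + y;            // leaf views opened and closed inside operator=
z = x + y - x;        // nested expression: ExpressionViewOpen/Close recurse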
/////////////////////////////////////////// - Lattice(GridBase *grid) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); this->checkerboard=0; + SetViewMode(mode); } // virtual ~Lattice(void) = default; @@ -445,11 +281,12 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = AcceleratorView(ViewWrite); - auto him= r.AcceleratorView(ViewRead); + auto me = View(AcceleratorWriteDiscard); + auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); + me.ViewClose(); him.ViewClose(); return *this; } @@ -459,11 +296,12 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = AcceleratorView(ViewWrite); - auto him= r.AcceleratorView(ViewRead); + auto me = View(AcceleratorWriteDiscard); + auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); + me.ViewClose(); him.ViewClose(); return *this; } /////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index f1126936..9f1155eb 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -51,34 +51,18 @@ template void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) { typedef decltype(basis[0]) Field; - typedef decltype(basis[0].View()) View; - auto tmp_v = basis[0].AcceleratorView(ViewReadWrite); - Vector basis_v(basis.size(),tmp_v); - typedef typename std::remove_reference::type vobj; + typedef decltype(basis[0].View(AcceleratorRead)) View; + + Vector basis_v; basis_v.reserve(basis.size()); GridBase* grid = basis[0].Grid(); for(int k=0;k B(Nm); // Thread private - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead + typedef typename std::remove_reference::type vobj; + Vector Bt(siteBlock * nrot); auto Bp=&Bt[0]; @@ -96,7 +82,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) int j = i/Nm; int k = i%Nm; Qt_p[i]=Qt(j,k); - }); + }); // Block the loop to keep storage footprint down for(uint64_t s=0;s void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) { - typedef decltype(basis[0].AcceleratorView()) View; + typedef decltype(basis[0].View(AcceleratorRead)) View; typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.AcceleratorView(ViewWrite); - Vector basis_v(basis.size(),result_v); + + Vector basis_v; basis_v.reserve(basis.size()); for(int k=0;k Qt_jv(Nm); double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ auto B=coalescedRead(zz); for(int k=k0; k &basis,Eigen::MatrixXd& Qt,in } coalescedWrite(result_v[ss], B); }); + for(int k=0;k diff --git a/Grid/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h index bbed2ef5..6a29be94 100644 --- a/Grid/lattice/Lattice_comparison.h +++ b/Grid/lattice/Lattice_comparison.h @@ -78,9 +78,9 @@ template inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v, lhs, CpuRead); + autoView( 
rhs_v, rhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); }); @@ -93,8 +93,8 @@ template inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) { Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v, lhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, lhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs); }); @@ -107,8 +107,8 @@ template inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( rhs_v, rhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs,rhs_v[ss]); }); diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index a1abe58d..cd0f11ee 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,7 +37,7 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - auto l_v = l.View(); + autoView(l_v, l, CpuWrite); thread_for( o, grid->oSites(), { vector_type vI; Coordinate gcoor; @@ -51,23 +51,5 @@ template inline void LatticeCoordinate(Lattice &l,int mu) }); }; -// LatticeCoordinate(); -// FIXME for debug; deprecate this; made obscelete by -template void lex_sites(Lattice &l){ - auto l_v = l.View(); - Real *v_ptr = (Real *)&l_v[0]; - size_t o_len = l.Grid()->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); - - for(int i=0;i inline auto localNorm2 (const Lattice &rhs)-> Lattice { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); }); @@ -56,9 +56,9 @@ template inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); }); @@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Latt typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(rr())) srr; Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),1,{ // FIXME had issues with scalar version of outer // Use vector [] operator and don't read coalesce this loop diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h index 0980ad8a..7c470fef 100644 --- a/Grid/lattice/Lattice_matrix_reduction.h +++ b/Grid/lattice/Lattice_matrix_reduction.h @@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(); - auto Y_v = Y.View(); - auto R_v = R.View(); + autoView( X_v , X, CpuRead); + autoView( Y_v , Y, CpuRead); + autoView( R_v , R, 
CpuWrite); thread_region { std::vector s_x(Nblock); @@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(); - auto R_v = R.View(); + autoView( X_v , X, CpuRead); + autoView( R_v , R, CpuWrite); thread_region { @@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice int ostride=FullGrid->_ostride[Orthog]; typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + autoView( lhs_v , lhs, CpuRead); + autoView( rhs_v , rhs, CpuRead); thread_region { std::vector Left(Nblock); std::vector Right(Nblock); diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index 8f649bd7..c79becf2 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -46,9 +46,9 @@ auto PeekIndex(const Lattice &lhs,int i) -> Lattice(vobj(),i))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); + accelerator_for( ss, lhs_v.size(), 1, { ret_v[ss] = peekIndex(lhs_v[ss],i); }); return ret; @@ -58,9 +58,9 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice(vobj(),i,j))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); + accelerator_for( ss, lhs_v.size(), 1, { ret_v[ss] = peekIndex(lhs_v[ss],i,j); }); return ret; @@ -72,18 +72,18 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice void PokeIndex(Lattice &lhs,const Lattice(vobj(),0))> & rhs,int i) { - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + accelerator_for( ss, lhs_v.size(), 1, { pokeIndex(lhs_v[ss],rhs_v[ss],i); }); } template void PokeIndex(Lattice &lhs,const Lattice(vobj(),0,0))> & rhs,int i,int j) { - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + accelerator_for( ss, lhs_v.size(), 1, { pokeIndex(lhs_v[ss],rhs_v[ss],i,j); }); } @@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ // extract-modify-merge cycle is easiest way and this is not perf critical ExtractBuffer buf(Nsimd); - auto l_v = l.View(); + autoView( l_v , l, CpuWrite); if ( rank == grid->ThisRank() ) { extract(l_v[odx],buf); buf[idx] = s; @@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ grid->GlobalCoorToRankIndex(rank,odx,idx,site); ExtractBuffer buf(Nsimd); - auto l_v = l.View(); + autoView( l_v , l, CpuWrite); extract(l_v[odx],buf); s = buf[idx]; @@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ return; }; - ////////////////////////////////////////////////////////// // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// +// Must be CPU read view template -inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ - - GridBase *grid = l.Grid(); - +inline void peekLocalSite(sobj &s,const LatticeView &l,Coordinate &site) +{ + GridBase *grid = l.getGrid(); + 
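// New calling convention for peekLocalSite/pokeLocalSite introduced here: the
// caller opens a CPU view once and passes the view in, rather than each call
// opening one internally. Sketch, assuming a Lattice<vobj> named l and a local
// coordinate lcoor (this is the pattern localConvert and InsertSlice use below):
typename vobj::scalar_object s;
{
  autoView( l_v, l, CpuRead);
  peekLocalSite(s, l_v, lcoor);   // view mode must be CpuRead
}
{
  autoView( l_v, l, CpuWrite);
  pokeLocalSite(s, l_v, lcoor);   // view mode must be CpuWrite
}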
assert(l.mode==CpuRead); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nsimd = grid->Nsimd(); - assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); + assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); static const int words=sizeof(vobj)/sizeof(vector_type); @@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(); - scalar_type * vp = (scalar_type *)&l_v[odx]; + scalar_type * vp = (scalar_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w &l,Coordinate &site){ return; }; - +// Must be CPU write view template -inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ - - GridBase *grid=l.Grid(); +inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) +{ + GridBase *grid=l.getGrid(); + assert(l.mode==CpuWrite); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nsimd = grid->Nsimd(); - assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); + assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); static const int words=sizeof(vobj)/sizeof(vector_type); @@ -202,13 +202,11 @@ inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(); - scalar_type * vp = (scalar_type *)&l_v[odx]; + scalar_type * vp = (scalar_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w inline Lattice adj(const Lattice &lhs){ Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + ret.Checkerboard()=lhs.Checkerboard(); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], adj(lhs_v(ss))); }); @@ -51,9 +53,11 @@ template inline Lattice adj(const Lattice &lhs){ template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + ret.Checkerboard() = lhs.Checkerboard(); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 3147823d..c2955485 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -25,7 +25,7 @@ Author: Christoph Lehner #include -#ifdef GRID_NVCC +#if defined(GRID_CUDA)||defined(GRID_HIP) #include #endif @@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) { typedef typename vobj::scalar_object sobj; - const int Nsimd = vobj::Nsimd(); + // const int Nsimd = vobj::Nsimd(); + const int nthread = GridThread::GetThreads(); + + Vector sumarray(nthread); + for(int i=0;i +inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) +{ + typedef typename vobj::scalar_objectD sobj; + const int nthread = GridThread::GetThreads(); Vector sumarray(nthread); @@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) ssum = ssum+sumarray[i]; } - return ssum; + typedef typename vobj::scalar_object ssobj; + ssobj ret = ssum; + return ret; } + + template inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) { 
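// Standalone sketch of the sum_cpu/sumD_cpu structure above: each thread
// accumulates a private partial over its contiguous slice of sites and the
// partials are combined serially; sumD_cpu's point is that the accumulator is
// the double-precision scalar type even when the field is single precision.
// (OpenMP is used here for illustration; Grid uses GridThread.)
#include <omp.h>
#include <vector>
double sumD_cpu_sketch(const float *arg, long osites)
{
  int nthread = omp_get_max_threads();
  std::vector<double> sumarray(nthread, 0.0);   // promoted accumulators
#pragma omp parallel
  {
    int  thr = omp_get_thread_num();
    long lo  = ( thr      * osites) / nthread;
    long hi  = ((thr + 1) * osites) / nthread;
    double s = 0.0;
    for (long ss = lo; ss < hi; ss++) s += (double)arg[ss];
    sumarray[thr] = s;
  }
  double ssum = 0.0;
  for (int i = 0; i < nthread; i++) ssum += sumarray[i];
  return ssum;
}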
-#ifdef GRID_NVCC +#if defined(GRID_CUDA)||defined(GRID_HIP) return sum_gpu(arg,osites); #else return sum_cpu(arg,osites); #endif } +template +inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) +{ +#if defined(GRID_CUDA)||defined(GRID_HIP) + return sumD_gpu(arg,osites); +#else + return sumD_cpu(arg,osites); +#endif +} + template inline typename vobj::scalar_object sum(const Lattice &arg) { - auto arg_v = arg.View(); +#if defined(GRID_CUDA)||defined(GRID_HIP) + autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum(&arg_v[0],osites); + auto ssum= sum_gpu(&arg_v[0],osites); +#else + autoView(arg_v, arg, CpuRead); + Integer osites = arg.Grid()->oSites(); + auto ssum= sum_cpu(&arg_v[0],osites); +#endif arg.Grid()->GlobalSum(ssum); return ssum; } @@ -101,43 +150,30 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & ComplexD nrm; GridBase *grid = left.Grid(); - - // Might make all code paths go this way. - auto left_v = left.AcceleratorView(ViewRead); - auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); -#ifdef GRID_NVCC - // GPU - SIMT lane compliance... - typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; + // Might make all code paths go this way. + typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - + + { + autoView( left_v , left, AcceleratorRead); + autoView( right_v,right, AcceleratorRead); - accelerator_for( ss, sites, nsimd,{ - auto x_l = left_v(ss); - auto y_l = right_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); - }) + // GPU - SIMT lane compliance... + accelerator_for( ss, sites, 1,{ + auto x_l = left_v[ss]; + auto y_l = right_v[ss]; + inner_tmp_v[ss]=innerProductD(x_l,y_l); + }); + } // This is in single precision and fails some tests - // Need a sumD that sums in double - nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites)); -#else - // CPU - typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto x_l = left_v[ss]; - auto y_l = right_v[ss]; - inner_tmp_v[ss]=innerProductD(x_l,y_l); - }) - nrm = TensorRemove(sum(inner_tmp_v,sites)); -#endif + auto anrm = sum(inner_tmp_v,sites); + nrm = anrm; return nrm; } @@ -175,40 +211,24 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt GridBase *grid = x.Grid(); - auto x_v=x.AcceleratorView(ViewRead); - auto y_v=y.AcceleratorView(ViewRead); - auto z_v=z.AcceleratorView(ViewWrite); - const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); -#ifdef GRID_NVCC // GPU - typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); - }); - - nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites))); -#else - // CPU typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); + + accelerator_for( ss, sites, 1,{ + auto tmp = a*x_v[ss]+b*y_v[ss]; inner_tmp_v[ss]=innerProductD(tmp,tmp); z_v[ss]=tmp; }); - // 
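// Standalone sketch of the new rankInnerProduct structure: per-site inner
// products are promoted to double and written to a scratch vector, the vector
// is summed, and in Grid the rank-local result is then combined with
// grid->GlobalSum. std::complex stands in for the SIMD tensor types.
#include <complex>
#include <vector>
std::complex<double>
rank_inner_product_sketch(const std::vector<std::complex<float>> &left,
                          const std::vector<std::complex<float>> &right)
{
  size_t sites = left.size();
  std::vector<std::complex<double>> inner_tmp(sites);
  for (size_t ss = 0; ss < sites; ss++)                 // accelerator_for in Grid
    inner_tmp[ss] = std::conj(std::complex<double>(left[ss]))
                  * std::complex<double>(right[ss]);
  std::complex<double> nrm(0.0, 0.0);
  for (size_t ss = 0; ss < sites; ss++) nrm += inner_tmp[ss];
  return nrm;                                           // GlobalSum over ranks follows
}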
Already promoted to double nrm = real(TensorRemove(sum(inner_tmp_v,sites))); -#endif grid->GlobalSum(nrm); return nrm; } @@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti GridBase *grid = left.Grid(); - auto left_v=left.AcceleratorView(ViewRead); - auto right_v=right.AcceleratorView(ViewRead); - const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); -#ifdef GRID_NVCC // GPU - typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; - typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t; + typedef decltype(innerProductD(vobj(),vobj())) inner_t; + typedef decltype(innerProductD(vobj(),vobj())) norm_t; Vector inner_tmp(sites); - Vector norm_tmp(sites); + Vector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; + { + autoView(left_v,left, AcceleratorRead); + autoView(right_v,right,AcceleratorRead); + accelerator_for( ss, sites, 1,{ + auto left_tmp = left_v[ss]; + inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]); + norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp); + }); + } - accelerator_for( ss, sites, nsimd,{ - auto left_tmp = left_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); - coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); - }); - - tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites)); - tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites)); -#else - // CPU - typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t; - typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t; - Vector inner_tmp(sites); - Vector norm_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - auto norm_tmp_v = &norm_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto left_tmp = left_v(ss); - inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss)); - norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp); - }); - // Already promoted to double tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); -#endif + grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector ip = tmp[0]; nrm = real(tmp[1]); @@ -335,7 +337,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction - auto Data_v=Data.View(); + autoView( Data_v, Data, CpuRead); thread_for( r,rd, { int so=r*grid->_ostride[orthogdim]; // base offset for start of plane for(int n=0;n & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - auto lhv=lhs.View(); - auto rhv=rhs.View(); + autoView( lhv, lhs, CpuRead); + autoView( rhv, rhs, CpuRead); thread_for( r,rd,{ int so=r*grid->_ostride[orthogdim]; // base offset for start of plane @@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice tensor_reduced at; at=av; - auto Rv=R.View(); - auto Xv=X.View(); - auto Yv=Y.View(); - thread_for_collapse(2, n, e1, { - for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v=X.View(); - auto Y_v=Y.View(); - auto R_v=R.View(); + autoView( X_v, X, CpuRead); + autoView( Y_v, Y, CpuRead); + autoView( R_v, R, CpuWrite); thread_region { Vector s_x(Nblock); @@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< // int nl=1; //FIXME package in a convenient iterator + // thread_for2d_in_region //Should loop over a plane orthogonal to direction "Orthog" int 
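// Standalone sketch of the fused update in axpby_norm_fast above: z = a*x + b*y
// and the accumulation of |z|^2 happen in one pass, with the norm promoted to
// double before the rank-local sum (grid->GlobalSum(nrm) completes it in Grid).
#include <complex>
#include <vector>
double axpby_norm_sketch(std::vector<std::complex<float>> &z,
                         std::complex<float> a, std::complex<float> b,
                         const std::vector<std::complex<float>> &x,
                         const std::vector<std::complex<float>> &y)
{
  double nrm = 0.0;
  for (size_t ss = 0; ss < x.size(); ss++) {            // accelerator_for in Grid
    std::complex<float> tmp = a * x[ss] + b * y[ss];
    z[ss] = tmp;
    nrm  += std::norm(std::complex<double>(tmp));       // innerProductD(tmp,tmp)
  }
  return nrm;
}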
stride=FullGrid->_slice_stride[Orthog]; int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto R_v = R.View(); - auto X_v = X.View(); + autoView( R_v, R, CpuWrite); + autoView( X_v, X, CpuRead); thread_region { std::vector s_x(Nblock); @@ -692,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v=lhs.View(); - auto rhs_v=rhs.View(); + autoView( lhs_v, lhs, CpuRead); + autoView( rhs_v, rhs, CpuRead); thread_region { std::vector Left(Nblock); diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index c5d75356..5f490507 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -1,7 +1,13 @@ NAMESPACE_BEGIN(Grid); -#define WARP_SIZE 32 +#ifdef GRID_HIP +extern hipDeviceProp_t *gpu_props; +#endif +#ifdef GRID_CUDA extern cudaDeviceProp *gpu_props; +#endif + +#define WARP_SIZE 32 __device__ unsigned int retirementCount = 0; template @@ -19,7 +25,12 @@ template void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { int device; +#ifdef GRID_CUDA cudaGetDevice(&device); +#endif +#ifdef GRID_HIP + hipGetDevice(&device); +#endif Iterator warpSize = gpu_props[device].warpSize; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; @@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { sobj *smem = (sobj *)shmem_pointer; // wait until all outstanding memory instructions in this thread are finished - __threadfence(); + acceleratorFence(); if (tid==0) { unsigned int ticket = atomicInc(&retirementCount, gridDim.x); @@ -156,8 +167,8 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { } // each thread must read the correct value of amLast - __syncthreads(); - + acceleratorSynchroniseAll(); + if (amLast) { // reduce buffer[0], ..., buffer[gridDim.x-1] Iterator i = tid; @@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) sobj *buffer_v = &buffer[0]; reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); - cudaDeviceSynchronize(); - - cudaError err = cudaGetLastError(); - if ( cudaSuccess != err ) { - printf("Cuda error %s\n",cudaGetErrorString( err )); - exit(0); - } + accelerator_barrier(); auto result = buffer_v[0]; return result; } diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 1bb1f087..e5e63716 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -375,7 +375,7 @@ public: int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int words = sizeof(scalar_object) / sizeof(scalar_type); - auto l_v = l.View(); + autoView(l_v, l, CpuWrite); thread_for( ss, osites, { ExtractBuffer buf(Nsimd); for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times @@ -461,8 +461,8 @@ public: } { - // Obtain one reseeded generator per thread - int Nthread = GridThread::GetThreads(); + // Obtain one reseeded generator per thread + int Nthread = 32; // Hardwire a good level or parallelism std::vector seeders(Nthread); for(int t=0;t inline auto trace(const Lattice &lhs) -> Lattice { Lattice ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView(ret_v , ret, AcceleratorWrite); + autoView(lhs_v , lhs, AcceleratorRead); accelerator_for( ss, 
lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], trace(lhs_v(ss))); }); @@ -58,8 +58,8 @@ template inline auto TraceIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], traceIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c23ddcdc..beceecc9 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine) //////////////////////////////////////////////////////////////////////////////////////////// // remove and insert a half checkerboard //////////////////////////////////////////////////////////////////////////////////////////// -template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ +template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full) +{ half.Checkerboard() = cb; - auto half_v = half.View(); - auto full_v = full.View(); + autoView( half_v, half, CpuWrite); + autoView( full_v, full, CpuRead); thread_for(ss, full.Grid()->oSites(),{ int cbos; Coordinate coor; @@ -64,11 +65,11 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } }); } - -template inline void setCheckerboard(Lattice &full,const Lattice &half){ +template inline void setCheckerboard(Lattice &full,const Lattice &half) +{ int cb = half.Checkerboard(); - auto half_v = half.View(); - auto full_v = full.View(); + autoView( half_v , half, CpuRead); + autoView( full_v , full, CpuWrite); thread_for(ss,full.Grid()->oSites(),{ Coordinate coor; @@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & out = in; } -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { - ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; + ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in; } accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { - ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; + ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in; } accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { - ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; + ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in; } #endif @@ -151,12 +152,11 @@ accelerator_inline void convertType(T & out, const T & in) { template accelerator_inline void convertType(Lattice & out, const Lattice & in) { - auto out_v = out.AcceleratorView(ViewWrite); - auto in_v = in.AcceleratorView(ViewRead); - + autoView( out_v , out,AcceleratorWrite); + autoView( in_v , in ,AcceleratorRead); accelerator_for(ss,out_v.size(),T1::Nsimd(),{ convertType(out_v[ss],in_v(ss)); - }); + }); } //////////////////////////////////////////////////////////////////////////////////////////// @@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice & out, const Lattice & in) { //////////////////////////////////////////////////////////////////////////////////////////// template inline auto localInnerProductD(const Lattice &lhs,const Lattice &rhs) --> Lattice> +-> Lattice> { - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, 
AcceleratorRead); typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; Lattice> ret(lhs.Grid()); - auto ret_v = ret.AcceleratorView(ViewWrite); - accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ + { + autoView(ret_v, ret,AcceleratorWrite); + accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); }); - + } return ret; } @@ -194,14 +195,13 @@ inline void blockProject(Lattice > &coarseData, Lattice> ip(coarse); Lattice fineDataRed = fineData; - // auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.AcceleratorView(ViewWrite); - auto ip_ = ip.AcceleratorView(ViewReadWrite); + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( ip_ , ip, AcceleratorWrite); for(int v=0;v accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { convertType(coarseData_[sc](v),ip_[sc]); - }); + }); // improve numerical stability of projection // |fine> = |fine> - |basis> @@ -210,68 +210,6 @@ inline void blockProject(Lattice > &coarseData, } } -template -inline void blockProject1(Lattice > &coarseData, - const Lattice &fineData, - const std::vector > &Basis) -{ - typedef iVector coarseSiteData; - coarseSiteData elide; - typedef decltype(coalescedRead(elide)) ScalarComplex; - GridBase * fine = fineData.Grid(); - GridBase * coarse= coarseData.Grid(); - int _ndimension = coarse->_ndimension; - - // checks - assert( nbasis == Basis.size() ); - subdivides(coarse,fine); - for(int i=0;i_rdimensions[d] / coarse->_rdimensions[d]; - assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); - } - int blockVol = fine->oSites()/coarse->oSites(); - - coarseData=Zero(); - - auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); - //////////////////////////////////////////////////////////////////////////////////////////////////////// - // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. 
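// What blockProject computes, reduced to a scalar, single-block sketch: each
// coarse degree of freedom is the inner product of one basis vector with the
// fine data over that block (the blockInnerProductD call above), after which
// the projected component is subtracted back out of the fine field for stability.
#include <complex>
#include <vector>
std::vector<std::complex<double>>
block_project_sketch(const std::vector<std::vector<std::complex<double>>> &basis,
                     const std::vector<std::complex<double>> &fine_block)
{
  std::vector<std::complex<double>> coarse(basis.size());
  for (size_t v = 0; v < basis.size(); v++)
    for (size_t sf = 0; sf < fine_block.size(); sf++)
      coarse[v] += std::conj(basis[v][sf]) * fine_block[sf];
  return coarse;
}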
- // Otherwise do fine inner product per site, and make the update atomic - //////////////////////////////////////////////////////////////////////////////////////////////////////// - accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { - - auto sc=sci/nbasis; - auto i=sci%nbasis; - auto Basis_ = Basis[i].View(); - - Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - - int sf; - decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); - - for(int sb=0;sb_rdimensions); - - reduce=reduce+innerProduct(Basis_(sf),fineData_(sf)); - } - coalescedWrite(coarseData_[sc](i),reduce); - }); - return; -} template inline void blockZAXPY(Lattice &fineZ, @@ -298,10 +236,10 @@ template assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ = fineZ.AcceleratorView(ViewWrite); - auto fineX_ = fineX.AcceleratorView(ViewRead); - auto fineY_ = fineY.AcceleratorView(ViewRead); - auto coarseA_= coarseA.AcceleratorView(ViewRead); + autoView( fineZ_ , fineZ, AcceleratorWrite); + autoView( fineX_ , fineX, AcceleratorRead); + autoView( fineY_ , fineY, AcceleratorRead); + autoView( coarseA_, coarseA, AcceleratorRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { @@ -314,7 +252,7 @@ template Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT typename vobj2::tensor_reduced::scalar_object cA; typename vobj::scalar_object cAx; #else @@ -344,15 +282,16 @@ template Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); Lattice coarse_inner(coarse); - auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); - auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); - // Precision promotion - fine_inner = localInnerProductD(fineX,fineY); + fine_inner = localInnerProductD(fineX,fineY); blockSum(coarse_inner,fine_inner); - accelerator_for(ss, coarse->oSites(), 1, { + { + autoView( CoarseInner_ , CoarseInner,AcceleratorWrite); + autoView( coarse_inner_ , coarse_inner,AcceleratorRead); + accelerator_for(ss, coarse->oSites(), 1, { convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); }); + } } @@ -370,14 +309,15 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? 
- auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); - auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); - fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - accelerator_for(ss, coarse->oSites(), 1, { - CoarseInner_[ss] = coarse_inner_[ss]; - }); + { + autoView( CoarseInner_ , CoarseInner, AcceleratorWrite); + autoView( coarse_inner_ , coarse_inner, AcceleratorRead); + accelerator_for(ss, coarse->oSites(), 1, { + CoarseInner_[ss] = coarse_inner_[ss]; + }); + } } template @@ -408,8 +348,10 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) } int blockVol = fine->oSites()/coarse->oSites(); - auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); - auto fineData_ = fineData.AcceleratorView(ViewRead); + // Turn this around to loop threaded over sc and interior loop + // over sf would thread better + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( fineData_ , fineData, AcceleratorRead); accelerator_for(sc,coarse->oSites(),1,{ @@ -510,8 +452,8 @@ inline void blockPromote(const Lattice > &coarseData, for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } - auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); + autoView( fineData_ , fineData, AcceleratorWrite); + autoView( coarseData_ , coarseData, AcceleratorRead); // Loop with a cache friendly loop ordering accelerator_for(sf,fine->oSites(),1,{ @@ -524,7 +466,7 @@ inline void blockPromote(const Lattice > &coarseData, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); for(int i=0;i > &coarseData, fineData=Zero(); for(int i=0;i > ip = PeekIndex<0>(coarseData,i); - auto ip_ = ip.AcceleratorView(ViewRead); + + //Lattice cip(coarse); + //autoView( cip_ , cip, AcceleratorWrite); + //autoView( ip_ , ip, AcceleratorRead); + //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ + // coalescedWrite(cip_[sc], ip_(sc)()); + // }); + //blockZAXPY(fineData,cip,Basis[i],fineData); blockZAXPY(fineData,ip,Basis[i],fineData); } } @@ -571,15 +520,17 @@ void localConvert(const Lattice &in,Lattice &out) assert(ig->lSites() == og->lSites()); } + autoView(in_v,in,CpuRead); + autoView(out_v,out,CpuWrite); thread_for(idx, ig->lSites(),{ sobj s; ssobj ss; Coordinate lcoor(ni); ig->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(s,in,lcoor); + peekLocalSite(s,in_v,lcoor); ss=s; - pokeLocalSite(ss,out,lcoor); + pokeLocalSite(ss,out_v,lcoor); }); } @@ -614,8 +565,9 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.AcceleratorView(ViewWrite); - auto f_v = From.AcceleratorView(ViewRead); + + autoView( t_v , To, AcceleratorWrite); + autoView( f_v , From, AcceleratorRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd); @@ -638,8 +590,6 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int w=0;w &lowDim,Lattice & higherDim,int slice } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuRead); + autoView(higherDimv,higherDim,CpuWrite); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -682,8 +634,8 @@ void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice hcoor[d]=lcoor[ddl++]; } } - peekLocalSite(s,lowDim,lcoor); - pokeLocalSite(s,higherDim,hcoor); + peekLocalSite(s,lowDimv,lcoor); + pokeLocalSite(s,higherDimv,hcoor); }); } @@ -711,6 +663,8 @@ void 
ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic } } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuWrite); + autoView(higherDimv,higherDim,CpuRead); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -723,8 +677,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic hcoor[d]=lcoor[ddl++]; } } - peekLocalSite(s,higherDim,hcoor); - pokeLocalSite(s,lowDim,lcoor); + peekLocalSite(s,higherDimv,hcoor); + pokeLocalSite(s,lowDimv,lcoor); }); } @@ -752,6 +706,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuRead); + autoView(higherDimv,higherDim,CpuWrite); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -760,8 +716,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; hcoor[orthog] = slice_hi; - peekLocalSite(s,lowDim,lcoor); - pokeLocalSite(s,higherDim,hcoor); + peekLocalSite(s,lowDimv,lcoor); + pokeLocalSite(s,higherDimv,hcoor); } }); } @@ -789,6 +745,8 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuWrite); + autoView(higherDimv,higherDim,CpuRead); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -797,8 +755,8 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; hcoor[orthog] = slice_hi; - peekLocalSite(s,higherDim,hcoor); - pokeLocalSite(s,lowDim,lcoor); + peekLocalSite(s,higherDimv,hcoor); + pokeLocalSite(s,lowDimv,lcoor); } }); } @@ -862,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) } //loop over outer index - auto in_v = in.View(); + autoView( in_v , in, CpuRead); thread_for(in_oidx,in_grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray out_ptrs(in_nsimd); @@ -955,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) icoor[lane].resize(ndim); grid->iCoorFromIindex(icoor[lane],lane); } - auto out_v = out.View(); + autoView( out_v , out, CpuWrite); thread_for(oidx, grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray ptrs(nsimd); @@ -1058,7 +1016,7 @@ void precisionChange(Lattice &out, const Lattice &in) std::vector in_slex_conv(in_grid->lSites()); unvectorizeToLexOrdArray(in_slex_conv, in); - auto out_v = out.View(); + autoView( out_v , out, CpuWrite); thread_for(out_oidx,out_grid->oSites(),{ Coordinate out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); diff --git a/Grid/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h index 6fe08c10..adfe3380 100644 --- a/Grid/lattice/Lattice_transpose.h +++ b/Grid/lattice/Lattice_transpose.h @@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid); template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); }); @@ -58,8 +58,8 @@ template inline auto TransposeIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); 
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss] , transposeIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_unary.h b/Grid/lattice/Lattice_unary.h index 591afe72..07424b3d 100644 --- a/Grid/lattice/Lattice_unary.h +++ b/Grid/lattice/Lattice_unary.h @@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid); template Lattice pow(const Lattice &rhs_i,RealD y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + autoView( rhs, rhs_i, AcceleratorRead); + autoView( ret, ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),1,{ ret[ss]=pow(rhs[ss],y); @@ -45,8 +45,8 @@ template Lattice pow(const Lattice &rhs_i,RealD y){ } template Lattice mod(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + autoView( rhs , rhs_i, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],mod(rhs(ss),y)); @@ -56,8 +56,8 @@ template Lattice mod(const Lattice &rhs_i,Integer y){ template Lattice div(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto ret = ret_i.View(); - auto rhs = rhs_i.View(); + autoView( ret , ret_i, AcceleratorWrite); + autoView( rhs , rhs_i, AcceleratorRead); ret.Checkerboard() = rhs_i.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],div(rhs(ss),y)); @@ -67,8 +67,8 @@ template Lattice div(const Lattice &rhs_i,Integer y){ template Lattice expMat(const Lattice &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + autoView( rhs , rhs_i, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h new file mode 100644 index 00000000..3b76b921 --- /dev/null +++ b/Grid/lattice/Lattice_view.h @@ -0,0 +1,168 @@ +#pragma once +NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////////////////// +// Base class which can be used by traits to pick up behaviour +/////////////////////////////////////////////////////////////////// +class LatticeBase {}; + +///////////////////////////////////////////////////////////////////////////////////////// +// Conformable checks; same instance of Grid required +///////////////////////////////////////////////////////////////////////////////////////// +void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) +{ + assert(lhs == rhs); +} + +//////////////////////////////////////////////////////////////////////////// +// Minimal base class containing only data valid to access from accelerator +// _odata will be a managed pointer in CUDA +//////////////////////////////////////////////////////////////////////////// +// Force access to lattice through a view object. +// prevents writing of code that will not offload to GPU, but perhaps annoyingly +// strict since host could could in principle direct access through the lattice object +// Need to decide programming model. 
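// Usage note (illustrative sketch only, not part of this patch): the pattern below
// shows how the autoView/ViewCloser machinery introduced further down in this header
// is used at call sites throughout the patch. The function and variable names here
// are hypothetical; Lattice, accelerator_for, coalescedWrite and the
// Accelerator{Read,Write} view modes are the Grid facilities this patch converts to.
//
//   template<class vobj>
//   void copyExample(Lattice<vobj> &out, const Lattice<vobj> &in)
//   {
//     conformable(out.Grid(), in.Grid());         // same Grid instance required
//     autoView( out_v , out, AcceleratorWrite);   // open a write view; closed automatically at scope exit
//     autoView( in_v  , in , AcceleratorRead);    // read view; MemoryManager is told the access mode
//     accelerator_for(ss, in_v.size(), vobj::Nsimd(), {
//       coalescedWrite(out_v[ss], in_v(ss));      // operator() reads the SIMT lane under GRID_SIMT
//     });
//   } // ViewCloser destructors run here and call ViewClose() on both views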
+#define LATTICE_VIEW_STRICT +template class LatticeAccelerator : public LatticeBase +{ +protected: + //public: + GridBase *_grid; + int checkerboard; + vobj *_odata; // A managed pointer + uint64_t _odata_size; + ViewAdvise advise; +public: + accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { }; + accelerator_inline uint64_t oSites(void) const { return _odata_size; }; + accelerator_inline int Checkerboard(void) const { return checkerboard; }; + accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view + accelerator_inline ViewAdvise Advise(void) const { return advise; }; + accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view + accelerator_inline void Conformable(GridBase * &grid) const + { + if (grid) conformable(grid, _grid); + else grid = _grid; + }; + // Host only + GridBase * getGrid(void) const { return _grid; }; +}; + +///////////////////////////////////////////////////////////////////////////////////////// +// A View class which provides accessor to the data. +// This will be safe to call from accelerator_for and is trivially copy constructible +// The copy constructor for this will need to be used by device lambda functions +///////////////////////////////////////////////////////////////////////////////////////// +template +class LatticeView : public LatticeAccelerator +{ +public: + // Rvalue + ViewMode mode; + void * cpu_ptr; +#ifdef GRID_SIMT + accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { + return coalescedRead(this->_odata[i]); + } +#else + accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } +#endif + + accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; + accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; + + accelerator_inline uint64_t begin(void) const { return 0;}; + accelerator_inline uint64_t end(void) const { return this->_odata_size; }; + accelerator_inline uint64_t size(void) const { return this->_odata_size; }; + + LatticeView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me){} + LatticeView(const LatticeView &refer_to_me) = default; // Trivially copyable + LatticeView(const LatticeAccelerator &refer_to_me,ViewMode mode) : LatticeAccelerator (refer_to_me) + { + this->ViewOpen(mode); + } + + // Host functions + void ViewOpen(ViewMode mode) + { // Translate the pointer, could save a copy. 
Could use a "Handle" and not save _odata originally in base + // std::cout << "View Open"<_odata<cpu_ptr = (void *)this->_odata; + this->mode = mode; + this->_odata =(vobj *) + MemoryManager::ViewOpen(this->cpu_ptr, + this->_odata_size*sizeof(vobj), + mode, + this->advise); + } + void ViewClose(void) + { // Inform the manager + // std::cout << "View Close"<cpu_ptr<cpu_ptr,this->mode); + } + +}; +// Little autoscope assister +template +class ViewCloser +{ + View v; // Take a copy of view and call view close when I go out of scope automatically + public: + ViewCloser(View &_v) : v(_v) {}; + ~ViewCloser() { v.ViewClose(); } +}; + +#define autoView(l_v,l,mode) \ + auto l_v = l.View(mode); \ + ViewCloser _autoView##l_v(l_v); + +///////////////////////////////////////////////////////////////////////////////////////// +// Lattice expression types used by ET to assemble the AST +// +// Need to be able to detect code paths according to the whether a lattice object or not +// so introduce some trait type things +///////////////////////////////////////////////////////////////////////////////////////// + +class LatticeExpressionBase {}; + +template using is_lattice = std::is_base_of; +template using is_lattice_expr = std::is_base_of; + +template struct ViewMapBase { typedef T Type; }; +template struct ViewMapBase { typedef LatticeView Type; }; +template using ViewMap = ViewMapBase::value >; + +template +class LatticeUnaryExpression : public LatticeExpressionBase +{ +public: + typedef typename ViewMap<_T1>::Type T1; + Op op; + T1 arg1; + LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {}; +}; + +template +class LatticeBinaryExpression : public LatticeExpressionBase +{ +public: + typedef typename ViewMap<_T1>::Type T1; + typedef typename ViewMap<_T2>::Type T2; + Op op; + T1 arg1; + T2 arg2; + LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {}; +}; + +template +class LatticeTrinaryExpression : public LatticeExpressionBase +{ +public: + typedef typename ViewMap<_T1>::Type T1; + typedef typename ViewMap<_T2>::Type T2; + typedef typename ViewMap<_T3>::Type T3; + Op op; + T1 arg1; + T2 arg2; + T3 arg3; + LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {}; +}; +NAMESPACE_END(Grid); diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h index 1e2a9528..dd25b41e 100644 --- a/Grid/perfmon/PerfCount.h +++ b/Grid/perfmon/PerfCount.h @@ -44,7 +44,7 @@ Author: paboyle #include #endif #ifdef __x86_64__ -#ifdef GRID_NVCC +#ifdef GRID_CUDA accelerator_inline uint64_t __rdtsc(void) { return 0; } accelerator_inline uint64_t __rdpmc(int ) { return 0; } #else @@ -112,7 +112,6 @@ class PerformanceCounter { private: typedef struct { - public: uint32_t type; uint64_t config; const char *name; diff --git a/Grid/pugixml/pugixml.cc b/Grid/pugixml/pugixml.cc index e7b395ad..45e6496a 100644 --- a/Grid/pugixml/pugixml.cc +++ b/Grid/pugixml/pugixml.cc @@ -12773,7 +12773,7 @@ namespace pugi #undef PUGI__THROW_ERROR #undef PUGI__CHECK_ERROR -#ifdef GRID_NVCC +#ifdef GRID_CUDA #pragma pop #endif diff --git a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h index 890c680b..0c8a0930 100644 --- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h +++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h @@ -114,19 +114,22 @@ public: U = adj(Cshift(U, mu, -1)); PokeIndex(Uadj, U, mu); } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); 
lidx++) { + + autoView(Umu_v,Umu,CpuRead); + autoView(Uadj_v,Uadj,CpuRead); + autoView(Uds_v,Uds,CpuWrite); + thread_for( lidx, GaugeGrid->lSites(), { Coordinate lcoor; GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUmu, Umu, lcoor); + peekLocalSite(ScalarUmu, Umu_v, lcoor); for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - peekLocalSite(ScalarUmu, Uadj, lcoor); + peekLocalSite(ScalarUmu, Uadj_v, lcoor); for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - pokeLocalSite(ScalarUds, Uds, lcoor); - } + pokeLocalSite(ScalarUds, Uds_v, lcoor); + }); } inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index fb6f18bb..16252340 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover); #include // 5d base used by all 5d overlap types NAMESPACE_CHECK(Wilson5D); +#include #include #include NAMESPACE_CHECK(Staggered); @@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; +typedef NaiveStaggeredFermion NaiveStaggeredFermionR; +typedef NaiveStaggeredFermion NaiveStaggeredFermionF; +typedef NaiveStaggeredFermion NaiveStaggeredFermionD; + typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; -#ifndef GRID_NVCC +#ifndef GRID_CUDA typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dD; diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 0b147b3f..0b726db9 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -96,11 +96,11 @@ public: int sl = St._simd_layout[direction]; Coordinate icoor; -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT _Spinor tmp; const int Nsimd =SiteDoubledGaugeField::Nsimd(); - int s = SIMTlane(Nsimd); + int s = acceleratorSIMTlane(Nsimd); St.iCoorFromIindex(icoor,s); int mmu = mu % Nd; @@ -232,15 +232,17 @@ public: if ( Params.twists[mu] ) { Uconj = where(coor==neglink,-Uconj,Uconj); } - - auto U_v = U.View(); - auto Uds_v = Uds.View(); - auto Uconj_v = Uconj.View(); - auto Utmp_v= Utmp.View(); - thread_foreach(ss,U_v,{ - Uds_v[ss](0)(mu) = U_v[ss](); - Uds_v[ss](1)(mu) = Uconj_v[ss](); - }); + + { + autoView( U_v , U, CpuRead); + autoView( Uconj_v , Uconj, CpuRead); + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,U_v,{ + Uds_v[ss](0)(mu) = U_v[ss](); + Uds_v[ss](1)(mu) = Uconj_v[ss](); + }); + } U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary Uconj = adj(Cshift(Uconj,mu,-1)); @@ -250,19 +252,25 @@ public: Utmp = where(coor==0,Uconj,Utmp); } - thread_foreach(ss,Utmp_v,{ - Uds_v[ss](0)(mu+4) = Utmp_v[ss](); - }); - + { + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,Utmp_v,{ + Uds_v[ss](0)(mu+4) = Utmp_v[ss](); + }); + } Utmp = Uconj; if ( Params.twists[mu] ) { Utmp = where(coor==0,U,Utmp); } - - thread_foreach(ss,Utmp_v,{ - Uds_v[ss](1)(mu+4) = Utmp_v[ss](); - }); - + + { + autoView( Uds_v , Uds, 
CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,Utmp_v,{ + Uds_v[ss](1)(mu+4) = Utmp_v[ss](); + }); + } } } @@ -272,11 +280,14 @@ public: GaugeLinkField link(mat.Grid()); // use lorentz for flavour as hack. auto tmp = TraceIndex(outerProduct(Btilde, A)); - auto link_v = link.View(); - auto tmp_v = tmp.View(); - thread_foreach(ss,tmp_v,{ - link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); - }); + + { + autoView( link_v , link, CpuWrite); + autoView( tmp_v , tmp, CpuRead); + thread_foreach(ss,tmp_v,{ + link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); + }); + } PokeIndex(mat, link, mu); return; } @@ -306,16 +317,18 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(); - auto Atilde_v = Atilde.View(); - auto Btilde_v = Btilde.View(); - thread_for(ss,tmp.Grid()->oSites(),{ - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); - tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - }); + { + autoView( tmp_v , tmp, CpuWrite); + autoView( Atilde_v , Atilde, CpuRead); + autoView( Btilde_v , Btilde, CpuRead); + thread_for(ss,tmp.Grid()->oSites(),{ + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); + tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + }); + } PokeIndex(mat, tmp, mu); return; } diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index d1bb0e9c..625eda63 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -61,8 +61,8 @@ public: double DhopCalls; double DhopCommTime; double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h new file mode 100644 index 00000000..ca38a64f --- /dev/null +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -0,0 +1,194 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_QCD_NAIVE_STAG_FERMION_H +#define GRID_QCD_NAIVE_STAG_FERMION_H + +NAMESPACE_BEGIN(Grid); + +class NaiveStaggeredFermionStatic { +public: + static const std::vector directions; + static const std::vector displacements; + static const int npoint = 8; +}; + +template +class NaiveStaggeredFermion : public StaggeredKernels, public NaiveStaggeredFermionStatic { +public: + INHERIT_IMPL_TYPES(Impl); + typedef StaggeredKernels Kernels; + + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _grid; } + GridBase *GaugeRedBlackGrid(void) { return _cbgrid; } + GridBase *FermionGrid(void) { return _grid; } + GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + + ////////////////////////////////////////////////////////////////// + // override multiply; cut number routines if pass dagger argument + // and also make interface more uniformly consistent + ////////////////////////////////////////////////////////////////// + void M(const FermionField &in, FermionField &out); + void Mdag(const FermionField &in, FermionField &out); + + ///////////////////////////////////////////////////////// + // half checkerboard operations + ///////////////////////////////////////////////////////// + void Meooe(const FermionField &in, FermionField &out); + void MeooeDag(const FermionField &in, FermionField &out); + void Mooee(const FermionField &in, FermionField &out); + void MooeeDag(const FermionField &in, FermionField &out); + void MooeeInv(const FermionField &in, FermionField &out); + void MooeeInvDag(const FermionField &in, FermionField &out); + + //////////////////////// + // Derivative interface + //////////////////////// + // Interface calls an internal routine + void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + + /////////////////////////////////////////////////////////////// + // non-hermitian hopping term; half cb or both + /////////////////////////////////////////////////////////////// + void Dhop (const FermionField &in, FermionField &out, int dag); + void DhopOE(const FermionField &in, FermionField &out, int dag); + void DhopEO(const FermionField &in, FermionField &out, int dag); + + /////////////////////////////////////////////////////////////// + // Multigrid assistance; force term uses too + /////////////////////////////////////////////////////////////// + void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); + void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); + + /////////////////////////////////////////////////////////////// + // Extra methods added by derived + 
/////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl &st, + DoubledGaugeField &U, + GaugeField &mat, + const FermionField &A, const FermionField &B, int dag); + + void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + + ////////////////////////////////////////////////////////////////////////// + // Grid own interface Constructor + ////////////////////////////////////////////////////////////////////////// + NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p = ImplParams()); + NaiveStaggeredFermion(GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p = ImplParams()); + + // DoubleStore impl dependent + void ImportGauge (const GaugeField &_U ); + DoubledGaugeField &GetU(void) { return Umu ; } ; + void CopyGaugeCheckerboards(void); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + + // protected: +public: + // any other parameters of action ??? + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } + RealD mass; + RealD u0; + RealD c1; + + GridBase *_grid; + GridBase *_cbgrid; + + // Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + /////////////////////////////////////////////////////////////// + // Conserved current utilities + /////////////////////////////////////////////////////////////// + void ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu); + void SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &srct, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx); +}; + +typedef NaiveStaggeredFermion NaiveStaggeredFermionF; +typedef NaiveStaggeredFermion NaiveStaggeredFermionD; + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index 6ef0ab9d..30deee06 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -47,23 +47,34 @@ template class StaggeredKernels : public FermionOperator , pub INHERIT_IMPL_TYPES(Impl); typedef FermionOperator Base; -public: - - void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); + public: + + void DhopImproved(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); + void 
DhopNaive(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); + + void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, + int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); + protected: /////////////////////////////////////////////////////////////////////////////////////// // Generic Nc kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); @@ -71,15 +82,18 @@ public: /////////////////////////////////////////////////////////////////////////////////////// // Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); @@ -87,27 +101,10 @@ public: /////////////////////////////////////////////////////////////////////////////////////// // Asm Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, + void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - /////////////////////////////////////////////////////////////////////////////////////////////////// - // Generic interface; fan out to right routine - /////////////////////////////////////////////////////////////////////////////////////////////////// - void DhopSite(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1); - - void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, 
int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1); - - void DhopSite(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior); public: diff --git a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h index 2d4de18e..18fe993c 100644 --- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h +++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h @@ -113,20 +113,7 @@ public: inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) { - GridBase *GaugeGrid = U_ds.Grid(); - thread_for(lidx, GaugeGrid->lSites(),{ - - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - Coordinate lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUds, U_ds, lcoor); - - peekLocalSite(ScalarU, U, lcoor); - ScalarUds(mu) = ScalarU(); - - }); + assert(0); } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 4b25d00e..91ad6d6d 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -257,15 +257,16 @@ private: CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO + public: // eventually these can be compressed into 6x6 blocks instead of the 12x12 // using the DeGrand-Rossi basis for the gamma matrices CloverFieldType fillCloverYZ(const GaugeLinkField &F) { CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); @@ -281,9 +282,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v, T,AcceleratorWrite); + autoView(F_v, F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()(); @@ -299,9 +300,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()()); @@ -317,9 +318,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v , T, AcceleratorWrite); + autoView( F_v , F, AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()()); @@ -335,9 +336,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v ,T,AcceleratorWrite); + autoView( F_v ,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()()); @@ -354,9 
+355,9 @@ private: T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v , T,AcceleratorWrite); + autoView( F_v , F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index e78023cf..52e1ee00 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,10 +106,10 @@ public: const _SpinorField & phi, int mu) { - auto out_v= out.View(); - auto phi_v= phi.View(); - auto Umu_v= Umu.View(); - thread_for(sss,out.Grid()->oSites(),{ + autoView( out_v, out, AcceleratorWrite); + autoView( phi_v, phi, AcceleratorRead); + autoView( Umu_v, Umu, AcceleratorRead); + accelerator_for(sss,out.Grid()->oSites(),1,{ multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); }); } @@ -191,18 +191,19 @@ public: int Ls=Btilde.Grid()->_fdimensions[0]; GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(); - auto Btilde_v = Btilde.View(); - auto Atilde_v = Atilde.View(); - thread_for(sss,tmp.Grid()->oSites(),{ - int sU=sss; - for(int s=0;s(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here - } - }); + { + autoView( tmp_v , tmp, AcceleratorWrite); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,tmp.Grid()->oSites(),1,{ + int sU=sss; + for(int s=0;s(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here + } + }); + } PokeIndex(mat,tmp,mu); - } }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index e9675b36..e79b64dc 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -180,7 +180,7 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; -#ifdef GRID_NVCC +#ifdef GRID_CUDA RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#ifndef GRID_NVCC +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#ifndef GRID_NVCC +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 
1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index dbdf134b..d2537ccf 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -50,9 +50,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -93,9 +93,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -131,8 +131,8 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); int Ls=this->Ls; @@ -193,8 +193,8 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi GridBase *grid=psi_i.Grid(); int Ls=this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); auto plee = & lee [0]; auto pdee = & dee [0]; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h index 034ce642..b54f63ad 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h @@ -65,9 +65,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi, psi_i,CpuRead); + autoView(phi, phi_i,CpuRead); + autoView(chi, chi_i,CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd= Simd::Nsimd(); @@ -213,9 +213,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi=psi_i.View(); - auto phi=phi_i.View(); - auto chi=chi_i.View(); + autoView(psi,psi_i,CpuRead); + autoView(phi,phi_i,CpuRead); + autoView(chi,chi_i,CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd= Simd::Nsimd(); @@ -357,8 +357,8 @@ CayleyFermion5D::MooeeInternalAsm(const FermionField &psi_i, FermionField Vector > &Matm) { EnableIf sfinae=0; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); #ifndef AVX512 { SiteHalfSpinor BcastP; @@ -535,8 +535,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField EnableIf sfinae=0; #ifndef AVX512 { - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); SiteHalfSpinor 
BcastP; SiteHalfSpinor BcastM; @@ -586,8 +586,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField } #else { - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); // pointers // MASK_REGS; #define Chi_00 %zmm0 diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 46d3fa1f..9a8454ef 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -46,9 +46,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; GridBase* grid = psi_i.Grid(); - auto phi = phi_i.View(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView( phi , phi_i, AcceleratorRead); + autoView( psi , psi_i, AcceleratorRead); + autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -82,9 +82,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio GridBase* grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView( psi , psi_i, AcceleratorRead); + autoView( phi , phi_i, AcceleratorRead); + autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -116,8 +116,8 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi=psi_i.View(); - auto chi=chi_i.View(); + autoView( psi, psi_i, AcceleratorRead); + autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; @@ -172,8 +172,8 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView( psi, psi_i, AcceleratorRead); + autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 44a201c1..888691c4 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionFi Compressor compressor; Stencil.HaloExchange(in,compressor); - auto Umu_v = Umu.View(); - auto UUUmu_v = UUUmu.View(); - auto in_v = in.View(); - auto out_v = out.View(); + autoView( Umu_v , Umu, CpuRead); + autoView( UUUmu_v , UUUmu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); thread_for( ss,Umu.Grid()->oSites(),{ for(int s=0;s::DhopInternal(StencilImpl & st, LebesgueOr DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { -#ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else -#endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); } @@ -294,9 +292,7 @@ void 
ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { -#ifdef GRID_OMP // assert((dag==DaggerNo) ||(dag==DaggerYes)); - Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; @@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DhopFaceTime-=usecond(); st.Prepare(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - double ctime=0; - double ptime=0; - ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ugly explicit thread mapping introduced for OPA reasons. + // Remove explicit thread mapping introduced for OPA reasons. ////////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + DhopComputeTime-=usecond(); { - int tid = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = 1; - assert(nthreads > ncomms); - if (tid >= ncomms) { - double start = usecond(); - nthreads -= ncomms; - int ttid = tid - ncomms; - int n = U.Grid()->oSites(); // 4d vol - int chunk = n / nthreads; - int rem = n % nthreads; - int myblock, myn; - if (ttid < rem) { - myblock = ttid * chunk + ttid; - myn = chunk+1; - } else { - myblock = ttid*chunk + rem; - myn = chunk; - } - - // do the compute - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - - if (dag == DaggerYes) { - for (int ss = myblock; ss < myblock+myn; ++ss) { - int sU = ss; - // Interior = 1; Exterior = 0; must implement for staggered - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<--------- - } - } else { - for (int ss = myblock; ss < myblock+myn; ++ss) { - // Interior = 1; Exterior = 0; - int sU = ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------ - } - } - ptime = usecond() - start; - } else { - double start = usecond(); - st.CommunicateThreaded(); - ctime = usecond() - start; - } + int interior=1; + int exterior=0; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopCommTime += ctime; - DhopComputeTime+=ptime; - - // First to enter, last to leave timing - st.CollateThreads(); + DhopComputeTime+=usecond(); DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); - DhopComputeTime2-=usecond(); + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - int sz=st.surface_list.size(); - thread_for( ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<---------- - }); - } else { - int sz=st.surface_list.size(); - thread_for( ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<---------- - }); + DhopComputeTime2-=usecond(); + { + int interior=0; + int exterior=1; + 
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime2+=usecond(); -#else - assert(0); -#endif - } template @@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; - - //double t1=usecond(); DhopTotalTime -= usecond(); DhopCommTime -= usecond(); @@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - thread_for( ss,U.Grid()->oSites(),{ - int sU=ss; - Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v); - }); - } else { - thread_for( ss,U.Grid()->oSites(),{ - int sU=ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v); - }); + { + int interior=1; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); - //double t2=usecond(); - //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; } /*CHANGE END*/ diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 57f4cb89..05d9a17e 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -258,10 +258,10 @@ void ImprovedStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGauge //////////////////////// // Call the single hop //////////////////////// - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto B_v = B.View(); - auto Btilde_v = Btilde.View(); + autoView( U_v , U, CpuRead); + autoView( UUU_v , UUU, CpuRead); + autoView( B_v , B, CpuWrite); + autoView( Btilde_v , Btilde, CpuWrite); thread_for(sss,B.Grid()->oSites(),{ Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); }); @@ -386,10 +386,10 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel Compressor compressor; Stencil.HaloExchange(in, compressor); - auto Umu_v = Umu.View(); - auto UUUmu_v = UUUmu.View(); - auto in_v = in.View(); - auto out_v = out.View(); + autoView( Umu_v , Umu, CpuRead); + autoView( UUUmu_v , UUUmu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); thread_for( sss, in.Grid()->oSites(),{ Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); }); @@ -403,11 +403,9 @@ void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder const FermionField &in, FermionField &out, int dag) { -#ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else -#endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); } template @@ -417,7 +415,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st const FermionField &in, FermionField &out, int dag) { 
-#ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); @@ -426,60 +423,30 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopFaceTime -= usecond(); st.Prepare(); st.HaloGather(in,compressor); - st.CommsMergeSHM(compressor); DhopFaceTime += usecond(); + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+= usecond(); + ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ugly explicit thread mapping introduced for OPA reasons. + // Removed explicit thread comms ////////////////////////////////////////////////////////////////////////////////////////////////////// DhopComputeTime -= usecond(); -#pragma omp parallel { - int tid = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = 1; - assert(nthreads > ncomms); - - if (tid >= ncomms) { - nthreads -= ncomms; - int ttid = tid - ncomms; - int n = len; - int chunk = n / nthreads; - int rem = n % nthreads; - int myblock, myn; - if (ttid < rem) { - myblock = ttid * chunk + ttid; - myn = chunk+1; - } else { - myblock = ttid*chunk + rem; - myn = chunk; - } - - // do the compute - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - for (int ss = myblock; ss < myblock+myn; ++ss) { - int sU = ss; - // Interior = 1; Exterior = 0; must implement for staggered - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); - } - } else { - for (int ss = myblock; ss < myblock+myn; ++ss) { - // Interior = 1; Exterior = 0; - int sU = ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); - } - } - } else { - st.CommunicateThreaded(); - } + int interior=1; + int exterior=0; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + // First to enter, last to leave timing DhopFaceTime -= usecond(); st.CommsMerge(compressor); @@ -487,28 +454,11 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopComputeTime2 -= usecond(); { - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - int sz=st.surface_list.size(); - thread_for(ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1); - }); - } else { - int sz=st.surface_list.size(); - thread_for(ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1); - }); - } + int interior=0; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime2 += usecond(); -#else - assert(0); -#endif } @@ -528,19 +478,11 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le st.HaloExchange(in, compressor); DhopCommTime += usecond(); - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); DhopComputeTime -= usecond(); - if (dag == DaggerYes) { - thread_for(sss, in.Grid()->oSites(),{ - Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); - }); - } else { - thread_for(sss, in.Grid()->oSites(),{ - Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); - }); + { + int 
interior=1; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index f74c7a51..41b9170d 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -44,9 +44,9 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -84,9 +84,9 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator @@ -132,9 +132,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -174,9 +174,9 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm GridBase *grid = psi_i.Grid(); int Ls = this->Ls; int shift_s = (this->pm == 1) ? 
(Ls-1) : 0; // s-component modified by shift operator - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -226,8 +226,8 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -286,8 +286,8 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto pm = this->pm; auto plee = & this->lee [0]; @@ -354,8 +354,8 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -410,8 +410,8 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); int Ls = this->Ls; auto pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h new file mode 100644 index 00000000..788e02cf --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -0,0 +1,499 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +#pragma once + +NAMESPACE_BEGIN(Grid); + +///////////////////////////////// +// Constructor and gauge import +///////////////////////////////// + +template +NaiveStaggeredFermion::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, + RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p) + : Kernels(p), + _grid(&Fgrid), + _cbgrid(&Hgrid), + Stencil(&Fgrid, npoint, Even, directions, displacements,p), + StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even + StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd + mass(_mass), + Lebesgue(_grid), + LebesgueEvenOdd(_cbgrid), + Umu(&Fgrid), + UmuEven(&Hgrid), + UmuOdd(&Hgrid), + _tmp(&Hgrid) +{ + int vol4; + int LLs=1; + c1=_c1; + u0=_u0; + vol4= _grid->oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4= _cbgrid->oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); +} + +template +NaiveStaggeredFermion::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p) + : NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p) +{ + ImportGauge(_U); +} + +//////////////////////////////////////////////////////////// +// Momentum space propagator should be +// https://arxiv.org/pdf/hep-lat/9712010.pdf +// +// mom space action. +// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m +// +// must track through staggered flavour/spin reduction in literature to +// turn to free propagator for the one component chi field, a la page 4/5 +// of above link to implement a Fourier-based solver. +//////////////////////////////////////////////////////////// + +template +void NaiveStaggeredFermion::CopyGaugeCheckerboards(void) +{ + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); +} +template +void NaiveStaggeredFermion::ImportGauge(const GaugeField &_U) +{ + GaugeLinkField U(GaugeGrid()); + DoubledGaugeField _UUU(GaugeGrid()); + //////////////////////////////////////////////////////// + // Double Store should take two fields for Naik and one hop separately. + // Discard the Naik piece, as this is the Naive action + //////////////////////////////////////////////////////// + Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U ); + + //////////////////////////////////////////////////////// + // Apply scale factors to get the right fermion Kinetic term + // Could pass coeffs into the double store to save work.
+ // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) + //////////////////////////////////////////////////////// + for (int mu = 0; mu < Nd; mu++) { + + U = PeekIndex(Umu, mu); + PokeIndex(Umu, U*( 0.5*c1/u0), mu ); + + U = PeekIndex(Umu, mu+4); + PokeIndex(Umu, U*(-0.5*c1/u0), mu+4); + + } + + CopyGaugeCheckerboards(); +} + +///////////////////////////// +// Implement the interface +///////////////////////////// + +template +void NaiveStaggeredFermion::M(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Dhop(in, out, DaggerNo); + axpy(out, mass, in, out); +} + +template +void NaiveStaggeredFermion::Mdag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Dhop(in, out, DaggerYes); + axpy(out, mass, in, out); +} + +template +void NaiveStaggeredFermion::Meooe(const FermionField &in, FermionField &out) { + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } +} +template +void NaiveStaggeredFermion::MeooeDag(const FermionField &in, FermionField &out) { + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } +} + +template +void NaiveStaggeredFermion::Mooee(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + typename FermionField::scalar_type scal(mass); + out = scal * in; +} + +template +void NaiveStaggeredFermion::MooeeDag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Mooee(in, out); +} + +template +void NaiveStaggeredFermion::MooeeInv(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + out = (1.0 / (mass)) * in; +} + +template +void NaiveStaggeredFermion::MooeeInvDag(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + MooeeInv(in, out); +} + +/////////////////////////////////// +// Internal +/////////////////////////////////// + +template +void NaiveStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, + GaugeField & mat, + const FermionField &A, const FermionField &B, int dag) +{ + assert((dag == DaggerNo) || (dag == DaggerYes)); + + Compressor compressor; + + FermionField Btilde(B.Grid()); + FermionField Atilde(B.Grid()); + Atilde = A; + + st.HaloExchange(B, compressor); + + for (int mu = 0; mu < Nd; mu++) { + + //////////////////////// + // Call the single hop + //////////////////////// + autoView( U_v , U, CpuRead); + autoView( B_v , B, CpuWrite); + autoView( Btilde_v , Btilde, CpuWrite); + thread_for(sss,B.Grid()->oSites(),{ + Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); + }); + + assert(0);// need to figure out the force interface with a blasted three link term. 
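// [Illustrative annotation, not part of the upstream patch] For orientation, the one-hop operator that the link rescaling in ImportGauge above is meant to realise is, assuming the staggered phases eta_mu(x) are folded into the doubled gauge field by Impl::DoubleStore,
//
//   D chi(x) = m chi(x) + (c1/(2 u0)) sum_mu eta_mu(x) [ U_mu(x) chi(x+mu) - U_mu^dag(x-mu) chi(x-mu) ]
//
// which is why the forward links carry +0.5*c1/u0 and the stored backward (mu+4) links carry -0.5*c1/u0. There is no three-hop Naik piece for the naive action, which is also why the force interface above is left asserting for now.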
+ + } +} + +template +void NaiveStaggeredFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _grid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + mat.Checkerboard() = U.Checkerboard(); + + DerivInternal(Stencil, Umu, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _cbgrid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + assert(V.Checkerboard() == Even); + assert(U.Checkerboard() == Odd); + mat.Checkerboard() = Odd; + + DerivInternal(StencilEven, UmuOdd, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _cbgrid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + assert(V.Checkerboard() == Odd); + assert(U.Checkerboard() == Even); + mat.Checkerboard() = Even; + + DerivInternal(StencilOdd, UmuEven, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=2; + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); +} + +template +void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=1; + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); +} + +template +void NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=1; + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); +} + +template +void NaiveStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ + DhopDir(in, out, dir, disp); +} +template +void NaiveStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} + +template +void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ + + Compressor compressor; + Stencil.HaloExchange(in, compressor); + autoView( Umu_v , Umu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); + // thread_for( sss, in.Grid()->oSites(),{ + // Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); + // }); + assert(0); +}; + + +template +void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else + DhopInternalSerialComms(st,lo,U,in,out,dag); +} +template +void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + Compressor compressor; + int len = U.Grid()->oSites(); + + DhopTotalTime -= 
usecond(); + + DhopFaceTime -= usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + DhopFaceTime += usecond(); + + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+= usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Removed explicit thread comms + ////////////////////////////////////////////////////////////////////////////////////////////////////// + DhopComputeTime -= usecond(); + { + int interior=1; + int exterior=0; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime += usecond(); + + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + // First to enter, last to leave timing + DhopFaceTime -= usecond(); + st.CommsMerge(compressor); + DhopFaceTime += usecond(); + + DhopComputeTime2 -= usecond(); + { + int interior=0; + int exterior=1; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime2 += usecond(); +} + +template +void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + assert((dag == DaggerNo) || (dag == DaggerYes)); + + DhopTotalTime -= usecond(); + + DhopCommTime -= usecond(); + Compressor compressor; + st.HaloExchange(in, compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + { + int interior=1; + int exterior=1; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); +}; + + //////////////////////////////////////////////////////////////// + // Reporting + //////////////////////////////////////////////////////////////// +template +void NaiveStaggeredFermion::Report(void) +{ + Coordinate latt = _grid->GlobalDimensions(); + RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; + RealD NP = _grid->_Nprocessors; + RealD NN = _grid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" < +void
NaiveStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + +//////////////////////////////////////////////////////// +// Conserved current - not yet implemented. +//////////////////////////////////////////////////////// +template +void NaiveStaggeredFermion::ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu) +{ + assert(0); +} + +template +void NaiveStaggeredFermion::SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx) +{ + assert(0); + +} + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h index 1a13e73a..63fd2a2f 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h @@ -618,10 +618,10 @@ Author: paboyle NAMESPACE_BEGIN(Grid); template -void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { assert(0); @@ -680,12 +680,13 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge3 =(uint64_t)&UU[sU]( T ); + // This is the single precision 5th direction vectorised kernel #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -702,9 +703,10 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl } #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dag) { #ifdef AVX512 @@ -756,8 +758,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl // This is the single precision 5th direction vectorised kernel #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -841,9 +844,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl &st, } #include -template <> void 
StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -910,9 +913,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -181,8 +182,9 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG (U,Ym,2,skew,odd); HAND_STENCIL_LEG (U,Zm,1,skew,even); HAND_STENCIL_LEG (U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG(UUU,Xp,3,skew,even); HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); @@ -202,7 +205,7 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG(UUU,Zm,1,skew,even); HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); - + } if ( dag ) { result()()(0) = - even_0 - odd_0; result()()(1) = - even_1 - odd_1; @@ -218,9 +221,10 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template -void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -253,8 +257,9 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(U,Zm,1,skew,even); HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd); @@ -277,7 +283,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd); - + } // Assume every site must be connected to at least one interior point. No 1^4 subvols. 
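// [Illustrative annotation, not part of the upstream patch] The new Naik template flag lets one kernel body serve both staggered actions: with it set, the skew=8 block above adds the three-hop legs through UUU; with it cleared, the body reduces to the one-hop naive operator and the UUU argument is never used. A minimal sketch of the intended dispatch (the real call sites are the DhopImproved and DhopNaive drivers later in this patch, reached through the KERNEL_CALL macro; the non-type parameter spelling is obscured by the diff rendering, so the <1>/<0> form below is an assumption):
//
//   ThisKernel::template DhopSiteHand<1>(st_v, U_v, UUU_v, buf, sF, sU, in_v, out_v, dag); // improved: one-hop + Naik
//   ThisKernel::template DhopSiteHand<0>(st_v, U_v, U_v,   buf, sF, sU, in_v, out_v, dag); // naive: UUU slot aliased to U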
if ( dag ) { result()()(0) = - even_0 - odd_0; @@ -294,9 +300,10 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template -void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -329,8 +336,9 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd); @@ -353,7 +362,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd); - + } // Add sum of all exterior connected stencil legs if ( nmu ) { if ( dag ) { @@ -370,6 +379,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, } } +/* #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ template void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ @@ -385,7 +395,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ SiteSpinor *buf, int LLs, int sU, \ const FermionFieldView &in, FermionFieldView &out, int dag); \ - +*/ #undef LOAD_CHI NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index d301556c..141725a7 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid); if (SE->_is_local ) { \ if (SE->_permute) { \ chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ + permute(chi, in[SE->_offset], ptype); \ } else { \ - chi_p = &in[SE->_offset]; \ + chi_p = &in[SE->_offset]; \ } \ } else { \ chi_p = &buf[SE->_offset]; \ @@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid); if (SE->_is_local ) { \ if (SE->_permute) { \ chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ + permute(chi, in[SE->_offset], ptype); \ } else { \ - chi_p = &in[SE->_offset]; \ + chi_p = &in[SE->_offset]; \ } \ } else if ( st.same_node[Dir] ) { \ chi_p = &buf[SE->_offset]; \ } \ if (SE->_is_local || st.same_node[Dir] ) { \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], *chi_p, Dir); \ } #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ @@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid); if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ nmu++; \ chi_p = &buf[SE->_offset]; \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], *chi_p, Dir); \ } template @@ -78,10 +78,12 @@ StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; // Int, Ext, Int+Ext cases for comms overlap //////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, +template +void 
StaggeredKernels::DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int dag) { + SiteSpinor *buf, int sF, int sU, + const FermionFieldView &in, FermionFieldView &out, int dag) +{ const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; @@ -89,8 +91,10 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, int ptype; int skew; - for(int s=0;s::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); @@ -109,6 +114,7 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); + } if ( dag ) { Uchi = - Uchi; } @@ -120,9 +126,10 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, // Only contributions from interior of our node /////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { const SiteSpinor *chi_p; SiteSpinor chi; @@ -131,8 +138,9 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & int ptype; int skew ; - for(int s=0;s::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); @@ -152,6 +161,7 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); + } if ( dag ) { Uchi = - Uchi; } @@ -164,9 +174,10 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & // Only contributions from exterior of our node /////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { const SiteSpinor *chi_p; // SiteSpinor chi; @@ -176,8 +187,9 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & int nmu=0; int skew ; - for(int s=0;s::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); @@ -197,7 
+210,7 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); - + } if ( nmu ) { if ( dag ) { out[sF] = out[sF] - Uchi; @@ -211,72 +224,9 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & //////////////////////////////////////////////////////////////////////////////////// // Driving / wrapping routine to select right kernel //////////////////////////////////////////////////////////////////////////////////// - template -void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, - int interior,int exterior) -{ - int dag=1; - DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); -}; - -template -void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, - int interior,int exterior) -{ - int dag=0; - DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); -}; - -template -void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionFieldView &in, FermionFieldView &out, - int dag,int interior,int exterior) -{ - switch(Opt) { -#ifdef AVX512 - case OptInlineAsm: - if ( interior && exterior ) { - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else { - std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi assert(0); } +#define KERNEL_CALLNB(A,improved) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + ThisKernel:: template A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ + }); + +#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); + +#define ASM_CALL(A) \ + const uint64_t NN = Nsite*Ls; \ + thread_for( ss, NN, { \ + int sF = ss; \ + int sU = ss/Ls; \ + ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ + }); + +template +void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior) +{ + GridBase *FGrid=in.Grid(); + GridBase *UGrid=U.Grid(); + typedef StaggeredKernels ThisKernel; + autoView( UUU_v , UUU, AcceleratorRead); + autoView( U_v , U, AcceleratorRead); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + SiteSpinor * buf = st.CommBuf(); + + int Ls=1; + if(FGrid->Nd()==UGrid->Nd()+1){ + Ls = FGrid->_rdimensions[0]; + } + int Nsite = UGrid->oSites(); + + if( interior && exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} + if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} +#endif + } else if( interior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} +#endif + } else if( exterior ) { + if (Opt == 
OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} +#endif + } + assert(0 && " Kernel optimisation case not covered "); +} +template +void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag, int interior,int exterior) +{ + GridBase *FGrid=in.Grid(); + GridBase *UGrid=U.Grid(); + typedef StaggeredKernels ThisKernel; + autoView( UUU_v , U, AcceleratorRead); + autoView( U_v , U, AcceleratorRead); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + SiteSpinor * buf = st.CommBuf(); + + int Ls=1; + if(FGrid->Nd()==UGrid->Nd()+1){ + Ls = FGrid->_rdimensions[0]; + } + int Nsite = UGrid->oSites(); + + if( interior && exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} +#endif + } else if( interior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} +#endif + } else if( exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} +#endif + } +} + + +#undef KERNEL_CALLNB +#undef KERNEL_CALL +#undef ASM_CALL + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 36447153..df1bce7c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -98,32 +98,35 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) Coordinate lcoor; typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - for (int site = 0; site < lvol; site++) { - grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - peekLocalSite(Qx, CloverTerm, lcoor); - Qxinv = Zero(); - //if (csw!=0){ - for (int j = 0; j < Ns; j++) - for (int k = 0; k < Ns; k++) - for (int a = 0; a < DimRep; a++) - for (int b = 0; b < DimRep; b++){ - auto zz = Qx()(j, k)(a, b); - EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex(zz); - } - // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; - - EigenInvCloverOp = EigenCloverOp.inverse(); - //std::cout << EigenInvCloverOp << std::endl; - for (int j = 0; j < Ns; j++) - for (int k = 0; k < Ns; k++) - for (int a = 0; a < DimRep; a++) - for (int b = 0; b < DimRep; b++) - Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); - // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; - // } - pokeLocalSite(Qxinv, CloverTermInv, lcoor); + autoView(CTv,CloverTerm,CpuRead); + autoView(CTIv,CloverTermInv,CpuWrite); + for (int site = 0; site < lvol; site++) { + grid->LocalIndexToLocalCoor(site, lcoor); + EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + peekLocalSite(Qx, CTv, lcoor); + Qxinv = Zero(); + //if (csw!=0){ + for (int j = 0; j < Ns; j++) + for (int k = 0; k < Ns; k++) + for (int a = 0; a < DimRep; a++) + for (int b = 0; b < DimRep; b++){ + auto zz = Qx()(j, k)(a, b); + EigenCloverOp(a + 
j * DimRep, b + k * DimRep) = std::complex(zz); + } + // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; + + EigenInvCloverOp = EigenCloverOp.inverse(); + //std::cout << EigenInvCloverOp << std::endl; + for (int j = 0; j < Ns; j++) + for (int k = 0; k < Ns; k++) + for (int a = 0; a < DimRep; a++) + for (int b = 0; b < DimRep; b++) + Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); + // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; + // } + pokeLocalSite(Qxinv, CTIv, lcoor); + } } // Separate the even and odd parts diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 2a202a77..2cc308cc 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -580,16 +580,21 @@ void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh - for(int idx=0;idx<_grid->lSites();idx++){ - Coordinate lcoor(Nd); - Tcomplex cc; - // RealD sgn; - _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha,lcoor); - assert((double)real(cc)>=1.0); - assert(fabs((double)imag(cc))<=1.0e-15); - cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a,lcoor); + + { + autoView(cosha_v,cosha,CpuRead); + autoView(a_v,a,CpuWrite); + for(int idx=0;idx<_grid->lSites();idx++){ + Coordinate lcoor(Nd); + Tcomplex cc; + // RealD sgn; + _grid->LocalIndexToLocalCoor(idx,lcoor); + peekLocalSite(cc,cosha_v,lcoor); + assert((double)real(cc)>=1.0); + assert(fabs((double)imag(cc))<=1.0e-15); + cc = ScalComplex(::acosh(real(cc)),0.0); + pokeLocalSite(cc,a_v,lcoor); + } } Wea = ( exp( a) * abs(W) ); @@ -775,17 +780,20 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh + { + autoView(cosha_v,cosha,CpuRead); + autoView(a_v,a,CpuWrite); for(int idx=0;idx<_grid->lSites();idx++){ Coordinate lcoor(Nd); Tcomplex cc; // RealD sgn; _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha,lcoor); + peekLocalSite(cc,cosha_v,lcoor); assert((double)real(cc)>=1.0); assert(fabs((double)imag(cc))<=1.0e-15); cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a,lcoor); - } + pokeLocalSite(cc,a_v,lcoor); + }} Wea = ( exp( a) * abs(W) ); Wema= ( exp(-a) * abs(W) ); diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 5267e0c1..f647bef8 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -67,7 +67,12 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, diag_mass = 4.0 + mass; } - + int vol4; + vol4=Fgrid.oSites(); + Stencil.BuildSurfaceList(1,vol4); + vol4=Hgrid.oSites(); + StencilEven.BuildSurfaceList(1,vol4); + StencilOdd.BuildSurfaceList(1,vol4); } template @@ -483,32 +488,7 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, conformable(_grid, q_in_1.Grid()); conformable(_grid, q_in_2.Grid()); conformable(_grid, q_out.Grid()); -#if 0 - PropagatorField tmp1(_grid), tmp2(_grid); - q_out = Zero(); - - // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu). 
- // Inefficient comms method but not performance critical. - tmp1 = Cshift(q_in_1, mu, 1); - tmp2 = Cshift(q_in_2, mu, 1); - auto tmp1_v = tmp1.View(); - auto tmp2_v = tmp2.View(); - auto q_in_1_v=q_in_1.View(); - auto q_in_2_v=q_in_2.View(); - auto q_out_v = q_out.View(); - auto Umu_v = Umu.View(); - thread_for(sU, Umu.Grid()->oSites(),{ - Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], - q_in_2_v[sU], - q_out_v[sU], - Umu_v, sU, mu); - Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU], - tmp2_v[sU], - q_out_v[sU], - Umu_v, sU, mu); - }); -#else -#endif + assert(0); } @@ -524,62 +504,7 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, { conformable(_grid, q_in.Grid()); conformable(_grid, q_out.Grid()); -#if 0 - - // Lattice> ph(_grid), coor(_grid); - Complex i(0.0,1.0); - PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid); - unsigned int tshift = (mu == Tp) ? 1 : 0; - unsigned int LLt = GridDefaultLatt()[Tp]; - - q_out = Zero(); - LatticeInteger coords(_grid); - LatticeCoordinate(coords, Tp); - - // Need q(x + mu) and q(x - mu). - tmp = Cshift(q_in, mu, 1); - tmpFwd = tmp*lattice_cmplx; - tmp = lattice_cmplx*q_in; - tmpBwd = Cshift(tmp, mu, -1); - - auto coords_v = coords.View(); - auto tmpFwd_v = tmpFwd.View(); - auto tmpBwd_v = tmpBwd.View(); - auto Umu_v = Umu.View(); - auto q_out_v = q_out.View(); - - thread_for(sU, Umu.Grid()->oSites(), { - - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. - vPredicate t_mask; - t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask()); - - if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], - q_out_v[sU], - Umu_v, sU, mu, t_mask); - } - - // Repeat for backward direction. - t_mask() = ((coords_v[sU] >= (tmin + tshift)) && - (coords_v[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 )); - - timeSlices = Reduce(t_mask()); - - if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], - q_out_v[sU], - Umu_v, sU, mu, t_mask); - } - }); -#else -#endif + assert(0); } NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 0ff72789..9ca29367 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid); // Generic implementation; move to different file? 
//////////////////////////////////////////// +/* accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) { -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads uint4 * chip_pun = (uint4 *)&chip; @@ -51,7 +52,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) #endif return; } - +*/ + #define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ if (SE->_is_local) { \ @@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } else { \ chi = coalescedRead(buf[SE->_offset],lane); \ } \ - synchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); @@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } else if ( st.same_node[Dir] ) { \ chi = coalescedRead(buf[SE->_offset],lane); \ } \ - synchronise(); \ + acceleratorSynchronise(); \ if (SE->_is_local || st.same_node[Dir] ) { \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); \ } \ - synchronise(); + acceleratorSynchronise(); #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ @@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) Recon(result, Uchi); \ nmu++; \ } \ - synchronise(); + acceleratorSynchronise(); #define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ @@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } else { \ chi = coalescedRead(buf[SE->_offset],lane); \ } \ - synchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ Recon(result, Uchi); @@ -126,7 +128,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV StencilEntry *SE; int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); @@ -153,7 +155,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); @@ -181,7 +183,7 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi StencilEntry *SE; int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); result=Zero(); GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); @@ -203,7 +205,7 @@ void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeField typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); calcHalfSpinor chi; // calcHalfSpinor *chi_p; @@ -239,7 +241,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi int ptype; int nmu=0; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); result=Zero(); 
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); @@ -270,7 +272,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField int ptype; int nmu=0; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); result=Zero(); GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); @@ -300,7 +302,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField StencilEntry *SE; \ int ptype; \ const int Nsimd = SiteHalfSpinor::Nsimd(); \ - const int lane=SIMTlane(Nsimd); \ + const int lane=acceleratorSIMTlane(Nsimd); \ \ SE = st.GetEntry(ptype, dir, sF); \ GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ @@ -328,7 +330,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si StencilEntry *SE; int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); @@ -346,30 +348,30 @@ template void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, std::vector &out) { - auto U_v = U.View(); - auto in_v = in.View(); - auto st_v = st.View(); + autoView(U_v ,U,AcceleratorRead); + autoView(in_v ,in,AcceleratorRead); + autoView(st_v ,st,AcceleratorRead); - auto out_Xm = out[0].View(); - auto out_Ym = out[1].View(); - auto out_Zm = out[2].View(); - auto out_Tm = out[3].View(); - auto out_Xp = out[4].View(); - auto out_Yp = out[5].View(); - auto out_Zp = out[6].View(); - auto out_Tp = out[7].View(); - - accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ + autoView(out_Xm,out[0],AcceleratorWrite); + autoView(out_Ym,out[1],AcceleratorWrite); + autoView(out_Zm,out[2],AcceleratorWrite); + autoView(out_Tm,out[3],AcceleratorWrite); + autoView(out_Xp,out[4],AcceleratorWrite); + autoView(out_Yp,out[5],AcceleratorWrite); + autoView(out_Zp,out[6],AcceleratorWrite); + autoView(out_Tp,out[7],AcceleratorWrite); + auto CBp=st.CommBuf(); + accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{ int sU=sss/Ls; int sF =sss; - DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); - DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); - DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); - DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); - DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); - DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); - DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); - DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); + DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0); + DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1); + DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2); + DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3); + DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4); + DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5); + DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6); + DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7); }); } @@ -381,17 +383,18 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S assert(dirdisp<=7); assert(dirdisp>=0); - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + autoView(U_v ,U ,AcceleratorRead); + autoView(in_v ,in ,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v ,st ,AcceleratorRead); + auto CBp=st.CommBuf(); #define LoopBody(Dir) \ - case Dir : \ - 
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \ + case Dir : \ + accelerator_for(ss,Nsite,Simd::Nsimd(),{ \ for(int s=0;s::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField int Ls, int Nsite, const FermionField &in, FermionField &out, int interior,int exterior) { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + autoView(U_v , U,AcceleratorRead); + autoView(in_v , in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} #endif @@ -466,26 +469,26 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField int Ls, int Nsite, const FermionField &in, FermionField &out, int interior,int exterior) { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + autoView(U_v ,U,AcceleratorRead); + autoView(in_v ,in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif @@ -493,5 +496,9 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField assert(0 && " Kernel optimisation case not covered "); } +#undef KERNEL_CALLNB +#undef KERNEL_CALL +#undef ASM_CALL + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc new file mode 100644 index 00000000..c424cb2d --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc @@ -0,0 +1,36 @@ 
+/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +const std::vector NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3}); +const std::vector NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1}); + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master new file mode 100644 index 00000000..75b75678 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master @@ -0,0 +1,37 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#include +#include + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class NaiveStaggeredFermion; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc new file mode 120000 index 00000000..42057f56 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc @@ -0,0 +1 @@ +../NaiveStaggeredFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc new file mode 120000 index 00000000..42057f56 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc @@ -0,0 +1 @@ +../NaiveStaggeredFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index 330dcfa8..72a9eaf9 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -88,6 +88,7 @@ done CC_LIST=" \ ImprovedStaggeredFermion5DInstantiation \ ImprovedStaggeredFermionInstantiation \ + NaiveStaggeredFermionInstantiation \ StaggeredKernelsInstantiation " for impl in $STAG_IMPL_LIST diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index b9a5296d..9b7d5a60 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -86,9 +86,9 @@ public: // Move this elsewhere? 
FIXME static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W - auto U_v = U.View(); - auto W_v = W.View(); - thread_for( ss, U.Grid()->oSites(), { + autoView(U_v,U,AcceleratorWrite); + autoView(W_v,W,AcceleratorRead); + accelerator_for( ss, U.Grid()->oSites(), 1, { U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); }); } @@ -131,15 +131,14 @@ public: //static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); - auto U_v = U.View(); - auto P_v = P.View(); - thread_for(ss, P.Grid()->oSites(),{ + autoView(U_v,U,AcceleratorWrite); + autoView(P_v,P,AcceleratorRead); + accelerator_for(ss, P.Grid()->oSites(),1,{ for (int mu = 0; mu < Nd; mu++) { U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); } }); - - //auto end = std::chrono::high_resolution_clock::now(); + //auto end = std::chrono::high_resolution_clock::now(); // diff += end - start; // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; } diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index 3be84480..5a5f9251 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -89,8 +89,8 @@ public: action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared; - auto p_v = p.View(); - auto action_v = action.View(); + autoView( p_v , p, CpuRead); + autoView( action_v , action, CpuWrite); for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils @@ -146,8 +146,8 @@ public: for (int point = 0; point < npoint; point++) { - auto p_v = p.View(); - auto force_v = force.View(); + autoView( p_v , p, CpuRead); + autoView( force_v , force, CpuWrite); int permute_type; StencilEntry *SE; diff --git a/Grid/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h index 459e1d0b..28a9fdae 100644 --- a/Grid/qcd/modules/Registration.h +++ b/Grid/qcd/modules/Registration.h @@ -80,10 +80,11 @@ static Registrar, static Registrar< ConjugateGradientModule, HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); -//static Registrar< BiCGSTABModule, -// HMC_SolverModuleFactory > __CGWFmodXMLInit("BiCGSTAB"); -//static Registrar< ConjugateResidualModule, -// HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); + +static Registrar< BiCGSTABModule, + HMC_SolverModuleFactory > __BiCGWFmodXMLInit("BiCGSTAB"); +static Registrar< ConjugateResidualModule, + HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); // add the staggered, scalar versions here diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h index f4d00c72..0ff7fc25 100644 --- a/Grid/qcd/smearing/GaugeConfiguration.h +++ b/Grid/qcd/smearing/GaugeConfiguration.h @@ -49,7 +49,7 @@ public: private: const unsigned int smearingLevels; - Smear_Stout StoutSmearing; + Smear_Stout *StoutSmearing; std::vector SmearedSet; // Member functions @@ -72,7 +72,7 @@ private: previous_u = *ThinLinks; for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) { - StoutSmearing.smear(SmearedSet[smearLvl], previous_u); + StoutSmearing->smear(SmearedSet[smearLvl], previous_u); previous_u = SmearedSet[smearLvl]; // For debug purposes @@ -93,7 +93,7 @@ private: GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid); - StoutSmearing.BaseSmear(C, GaugeK); + StoutSmearing->BaseSmear(C, GaugeK); SigmaK = Zero(); iLambda = Zero(); @@ -107,7 +107,7 @@ private: 
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu); } - StoutSmearing.derivative(SigmaK, iLambda, + StoutSmearing->derivative(SigmaK, iLambda, GaugeK); // derivative of SmearBase return SigmaK; } @@ -144,14 +144,14 @@ private: // Exponential iQ2 = iQ * iQ; iQ3 = iQ * iQ2; - StoutSmearing.set_uw(u, w, iQ2, iQ3); - StoutSmearing.set_fj(f0, f1, f2, u, w); + StoutSmearing->set_uw(u, w, iQ2, iQ3); + StoutSmearing->set_fj(f0, f1, f2, u, w); e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2; // Getting B1, B2, Gamma and Lambda // simplify this part, reduntant calculations in set_fj - xi0 = StoutSmearing.func_xi0(w); - xi1 = StoutSmearing.func_xi1(w); + xi0 = StoutSmearing->func_xi0(w); + xi1 = StoutSmearing->func_xi1(w); u2 = u * u; w2 = w * w; cosw = cos(w); @@ -219,7 +219,7 @@ public: /* Standard constructor */ SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, Smear_Stout& Stout) - : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) + : smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL) { for (unsigned int i = 0; i < smearingLevels; ++i) SmearedSet.push_back(*(new GaugeField(UGrid))); @@ -227,7 +227,7 @@ public: /*! For just thin links */ SmearedConfiguration() - : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} + : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {} // attach the smeared routines to the thin links U and fill the smeared set void set_Field(GaugeField &U) diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index c7c7d329..b63d8571 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -185,13 +185,14 @@ void A2Autils::MesonField(TensorType &mat, for(int i=0;i::MesonField(TensorType &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::PionFieldXX(Eigen::Tensor &mat, for(int i=0;i::PionFieldXX(Eigen::Tensor &mat, } for(int j=0;j::PionFieldWVmom(Eigen::Tensor &mat, for(int i=0;i::PionFieldWVmom(Eigen::Tensor &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, for(int i=0;i::AslashField(TensorType &mat, for ( int m=0;m::ContractWWVV(std::vector &WWVV, for(int d_o=0;d_o::ContractWWVV(std::vector &WWVV, thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o::OuterProductWWVV(PropagatorField &WWVV, const vobj &rhs, const int Ns, const int ss) { - auto WWVV_v = WWVV.View(); + autoView(WWVV_v,WWVV,CpuWrite); for (int s1 = 0; s1 < Ns; s1++){ for (int s2 = 0; s2 < Ns; s2++){ WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0); @@ -1122,10 +1121,10 @@ void A2Autils::ContractFourQuarkColourDiagonal(const PropagatorField &WWV GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(); - auto WWVV1_v = WWVV1.View(); - auto O_trtr_v= O_trtr.View(); - auto O_fig8_v= O_fig8.View(); + autoView(WWVV0_v , WWVV0,CpuRead); + autoView(WWVV1_v , WWVV1,CpuRead); + autoView(O_trtr_v, O_trtr,CpuWrite); + autoView(O_fig8_v, O_fig8,CpuWrite); thread_for(ss,grid->oSites(),{ typedef typename ComplexField::vector_object vobj; @@ -1166,10 +1165,10 @@ void A2Autils::ContractFourQuarkColourMix(const PropagatorField &WWVV0, GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(); - auto WWVV1_v = WWVV1.View(); - auto O_trtr_v= O_trtr.View(); - auto O_fig8_v= O_fig8.View(); + autoView( WWVV0_v , WWVV0,CpuRead); + autoView( WWVV1_v , WWVV1,CpuRead); + autoView( O_trtr_v, O_trtr,CpuWrite); + autoView( O_fig8_v, O_fig8,CpuWrite); thread_for(ss,grid->oSites(),{ 
diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 0d93fa9e..c7a72812 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -350,11 +350,11 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); GridBase *grid = q1_left.Grid(); - - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); + + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); Real bytes =0.; bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); @@ -989,10 +989,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(); - auto vq_loop = qq_loop.View(); - auto vd_tf = qd_tf.View(); - auto vs_ti = qs_ti.View(); + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_loop , qq_loop, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_loop = vq_loop[ss]; @@ -1029,13 +1029,13 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(); - auto vq_ti = qq_ti.View(); - auto vq_tf = qq_tf.View(); - auto vd_tf = qd_tf.View(); - auto vs_ti = qs_ti.View(); - - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + autoView( vcorr , stn_corr, CpuWrite); + autoView( vq_ti , qq_ti, CpuRead); + autoView( vq_tf , qq_tf, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + thread_for(ss,grid->oSites(),{ auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; auto Dd_tf = vd_tf[ss]; diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 56f8f164..1e016e4e 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -47,8 +47,8 @@ void axpibg5x(Lattice &z,const Lattice &x,Coeff a,Coeff b) GridBase *grid=x.Grid(); Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto z_v = z.View(); + autoView(x_v, x, AcceleratorRead); + autoView(z_v, z, AcceleratorWrite); accelerator_for( ss, x_v.size(),vobj::Nsimd(), { auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); coalescedWrite(z_v[ss],tmp); @@ -63,9 +63,9 @@ void axpby_ssp(Lattice &z, Coeff a,const Lattice &x,Coeff b,const La conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); // FIXME -- need a new class of accelerator_loop to implement this // uint64_t nloop = grid->oSites()/Ls; @@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - 
auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice &z,Coeff a,const Lattice &x,Coeff b,con conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -189,8 +189,8 @@ void G5R5(Lattice &z,const Lattice &x) conformable(x,z); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -222,8 +222,8 @@ void G5C(Lattice> &z, const LatticeoSites(),CComplex::Nsimd(), { for(int n = 0; n < nb; ++n) { diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 7ad80d00..0cc0cc1a 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -222,11 +222,11 @@ public: conformable(subgroup, Determinant); int i0, i1; su2SubGroupIndex(i0, i1, su2_index); - auto subgroup_v = subgroup.View(); - auto source_v = source.View(); - auto Determinant_v = Determinant.View(); - thread_for(ss, grid->oSites(), { + autoView( subgroup_v , subgroup,AcceleratorWrite); + autoView( source_v , source,AcceleratorRead); + autoView( Determinant_v , Determinant,AcceleratorWrite); + accelerator_for(ss, grid->oSites(), 1, { subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0); subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1); @@ -257,15 +257,16 @@ public: su2SubGroupIndex(i0, i1, su2_index); dest = 1.0; // start out with identity - auto dest_v = dest.View(); - auto subgroup_v = subgroup.View(); - thread_for(ss, grid->oSites(), + autoView( dest_v , dest, AcceleratorWrite); + autoView( subgroup_v, subgroup, AcceleratorRead); + accelerator_for(ss, grid->oSites(),1, { dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0); dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1); dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0); dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1); }); + } /////////////////////////////////////////////// @@ -608,8 +609,8 @@ public: // reunitarise?? 
template - static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, - double scale = 1.0) { + static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0) + { GridBase *grid = out.Grid(); typedef typename LatticeMatrixType::vector_type vector_type; @@ -618,8 +619,7 @@ public: typedef iSinglet vTComplexType; typedef Lattice LatticeComplexType; - typedef typename GridTypeMapper< - typename LatticeMatrixType::vector_object>::scalar_object MatrixType; + typedef typename GridTypeMapper::scalar_object MatrixType; LatticeComplexType ca(grid); LatticeMatrixType lie(grid); @@ -629,6 +629,7 @@ public: MatrixType ta; lie = Zero(); + for (int a = 0; a < AdjointDimension; a++) { random(pRNG, ca); @@ -640,6 +641,7 @@ public: la = ci * ca * ta; lie = lie + la; // e^{i la ta} + } taExp(lie, out); } diff --git a/Grid/serialisation/Serialisation.h b/Grid/serialisation/Serialisation.h index c95226b1..e14120af 100644 --- a/Grid/serialisation/Serialisation.h +++ b/Grid/serialisation/Serialisation.h @@ -36,7 +36,7 @@ Author: Peter Boyle #include "BinaryIO.h" #include "TextIO.h" #include "XmlIO.h" -#ifndef GRID_NVCC +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) #include "JSON_IO.h" #endif diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 4584fb36..b9c6a81b 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -32,7 +32,12 @@ Author: Peter Boyle */ //---------------------------------------------------------------------- +#ifdef GRID_CUDA #include +#endif +#ifdef GRID_HIP +#include +#endif namespace Grid { @@ -142,7 +147,7 @@ typedef GpuVector GpuVectorI; accelerator_inline float half2float(half h) { float f; -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT f = __half2float(h); #else //f = __half2float(h); @@ -156,7 +161,7 @@ accelerator_inline float half2float(half h) accelerator_inline half float2half(float f) { half h; -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT h = __float2half(f); #else Grid_half hh = sfw_float_to_half(f); diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h index bc8cd2fd..37aee2ed 100644 --- a/Grid/simd/Simd.h +++ b/Grid/simd/Simd.h @@ -31,7 +31,7 @@ directory #ifndef GRID_SIMD_H #define GRID_SIMD_H -#ifdef GRID_NVCC +#if defined(GRID_CUDA) || defined(GRID_HIP) #include #endif @@ -65,7 +65,7 @@ typedef RealD Real; typedef RealF Real; #endif -#ifdef GRID_NVCC +#if defined(GRID_CUDA) || defined(GRID_HIP) typedef thrust::complex ComplexF; typedef thrust::complex ComplexD; typedef thrust::complex Complex; diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 37b866cb..3b9ae08e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -67,7 +67,8 @@ void Gather_plane_simple_table (Vector >& table,const Lattice { int num=table.size(); std::pair *table_v = & table[0]; - auto rhs_v = rhs.View(); + + auto rhs_v = rhs.View(AcceleratorRead); accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; compressed_t tmp_c; @@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector >& table,const Lattice compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); coalescedWrite(buffer[off+o],tmp_c); }); + rhs_v.ViewClose(); // Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table } @@ -94,7 +96,7 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic int num=table.size()/2; int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - auto rhs_v = rhs.View(); + auto rhs_v = 
rhs.View(AcceleratorRead); auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; @@ -104,10 +106,11 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic so+tp[2*j+1].second, type); }); + rhs_v.ViewClose(); } struct StencilEntry { -#ifdef GRID_NVCC +#ifdef GRID_CUDA uint64_t _byte_offset; // 8 bytes uint32_t _offset; // 4 bytes #else @@ -122,7 +125,7 @@ struct StencilEntry { // Could pack to 8 + 4 + 4 = 128 bit and use template -class CartesianStencilView { +class CartesianStencilAccelerator { public: typedef AcceleratorVector StencilVector; @@ -130,14 +133,15 @@ class CartesianStencilView { //////////////////////////////////////// // Basic Grid and stencil info //////////////////////////////////////// - int _checkerboard; - int _npoints; // Move to template param? + int _checkerboard; + int _npoints; // Move to template param? + int _osites; StencilVector _directions; StencilVector _distances; StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; - Coordinate _simd_layout; + Coordinate _simd_layout; Parameters parameters; StencilEntry* _entries_p; cobj* u_recv_buf_p; @@ -175,13 +179,43 @@ class CartesianStencilView { { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } - }; + +template +class CartesianStencilView : public CartesianStencilAccelerator +{ + private: + int *closed; + StencilEntry *cpu_ptr; + ViewMode mode; + public: + // default copy constructor + CartesianStencilView (const CartesianStencilView &refer_to_me) = default; + + CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode _mode) + : CartesianStencilAccelerator(refer_to_me), + cpu_ptr(this->_entries_p), + mode(_mode) + { + this->_entries_p =(StencilEntry *) + MemoryManager::ViewOpen(this->_entries_p, + this->_npoints*this->_osites*sizeof(StencilEntry), + mode, + AdviseDefault); + } + + void ViewClose(void) + { + MemoryManager::ViewClose(this->cpu_ptr,this->mode); + } + +}; + //////////////////////////////////////// // The Stencil Class itself //////////////////////////////////////// template -class CartesianStencil : public CartesianStencilView { // Stencil runs along coordinate axes only; NO diagonal fill in. +class CartesianStencil : public CartesianStencilAccelerator { // Stencil runs along coordinate axes only; NO diagonal fill in. 
public: typedef typename cobj::vector_type vector_type; @@ -226,8 +260,8 @@ public: // Generalise as required later if needed //////////////////////////////////////////////////////////////////////// - View_type View(void) const { - View_type accessor(*( (View_type *) this)); + View_type View(ViewMode mode) const { + View_type accessor(*( (View_type *) this),mode); return accessor; } @@ -662,9 +696,9 @@ public: _unified_buffer_size=0; surface_list.resize(0); - int osites = _grid->oSites(); + this->_osites = _grid->oSites(); - _entries.resize(this->_npoints* osites); + _entries.resize(this->_npoints* this->_osites); this->_entries_p = &_entries[0]; for(int ii=0;ii NAMESPACE_BEGIN(Grid); -//accelerator_inline void SIMTsynchronise(void) -accelerator_inline void synchronise(void) -{ -#ifdef __CUDA_ARCH__ -// __syncthreads(); - __syncwarp(); -#endif - return; -} -#ifndef __CUDA_ARCH__ +#ifndef GRID_SIMT ////////////////////////////////////////// // Trivial mapping of vectors on host ////////////////////////////////////////// -accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific - template accelerator_inline vobj coalescedRead(const vobj & __restrict__ vec,int lane=0) { @@ -66,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int template accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0) { - // vstream(vec, extracted); vec = extracted; } template accelerator_inline @@ -75,25 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ vstream(vec, extracted); } #else -accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific ////////////////////////////////////////// // Extract and insert slices on the GPU ////////////////////////////////////////// template accelerator_inline -typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd())) +typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd())) { return extractLane(lane,vec); } template accelerator_inline -typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd())) +typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd())) { int mask = vobj::Nsimd() >> (ptype + 1); int plane= doperm ? lane ^ mask : lane; return extractLane(plane,vec); } template accelerator_inline -void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd())) +void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd())) { insertLane(lane,vec,extracted); } diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index dbcbae8d..36becc49 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -59,6 +59,20 @@ class GridTensorBase {}; using DoublePrecision2= typename Traits::DoublePrecision2; \ static constexpr int TensorLevel = Traits::TensorLevel +/////////////////////////////////////////////////////////// +// Allows to turn scalar>>> back to double. 
+/////////////////////////////////////////////////////////// +template +accelerator_inline typename std::enable_if::value, T>::type +TensorRemove(T arg) { + return arg; +} +template +accelerator_inline auto TensorRemove(iScalar arg) + -> decltype(TensorRemove(arg._internal)) { + return TensorRemove(arg._internal); +} + template class iScalar { public: @@ -135,9 +149,10 @@ public: operator ComplexD() const { return (TensorRemove(_internal)); } + // instantiation of "Grid::iScalar::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, =0, =0U]" template = 0,IfNotSimd = 0> accelerator_inline operator RealD() const { - return TensorRemove(_internal); + return (RealD) TensorRemove(_internal); } template = 0, IfNotSimd = 0> accelerator_inline operator Integer() const { @@ -169,20 +184,6 @@ public: strong_inline scalar_type * end() { return begin() + Traits::count; } }; -/////////////////////////////////////////////////////////// -// Allows to turn scalar>>> back to double. -/////////////////////////////////////////////////////////// -template -accelerator_inline typename std::enable_if::value, T>::type -TensorRemove(T arg) { - return arg; -} -template -accelerator_inline auto TensorRemove(iScalar arg) - -> decltype(TensorRemove(arg._internal)) { - return TensorRemove(arg._internal); -} - template class iVector { public: diff --git a/Grid/tensors/Tensor_exp.h b/Grid/tensors/Tensor_exp.h index 11d37f9c..0a1d6389 100644 --- a/Grid/tensors/Tensor_exp.h +++ b/Grid/tensors/Tensor_exp.h @@ -55,7 +55,7 @@ template accelerator_inline iVector Exponentiate(c // Specialisation: Cayley-Hamilton exponential for SU(3) -#ifndef GRID_NVCC +#ifndef GRID_CUDA template::TensorLevel == 0>::type * =nullptr> accelerator_inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) { diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc new file mode 100644 index 00000000..2c4ad9df --- /dev/null +++ b/Grid/threads/Accelerator.cc @@ -0,0 +1,207 @@ +#include + +NAMESPACE_BEGIN(Grid); +uint32_t accelerator_threads=2; +uint32_t acceleratorThreads(void) {return accelerator_threads;}; +void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; + +#ifdef GRID_CUDA +cudaDeviceProp *gpu_props; +void acceleratorInit(void) +{ + int nDevices = 1; + cudaGetDeviceCount(&nDevices); + gpu_props = new cudaDeviceProp[nDevices]; + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + size_t totalDeviceMem=0; + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + cudaGetDeviceProperties(&gpu_props[i], i); + cudaDeviceProp prop; + prop = gpu_props[i]; + totalDeviceMem = prop.totalGlobalMem; + if ( 
world_rank == 0) { + printf("AcceleratorCudaInit: ========================\n"); + printf("AcceleratorCudaInit: Device Number : %d\n", i); + printf("AcceleratorCudaInit: ========================\n"); + printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name); + + GPU_PROP_FMT(totalGlobalMem,"%lld"); + GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + // GPU_PROP(unifiedAddressing); + // GPU_PROP(l2CacheSize); + // GPU_PROP(singleToDoublePrecisionPerfRatio); + } + } + MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours +#undef GPU_PROP_FMT +#undef GPU_PROP + +#ifdef GRID_IBM_SUMMIT + // IBM Jsrun makes cuda Device numbering screwy and not match rank + if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n"); +#else + if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n"); + cudaSetDevice(rank); +#endif + if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); +} +#endif + +#ifdef GRID_HIP +hipDeviceProp_t *gpu_props; +void acceleratorInit(void) +{ + int nDevices = 1; + hipGetDeviceCount(&nDevices); + gpu_props = new hipDeviceProp_t[nDevices]; + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + + hipGetDeviceProperties(&gpu_props[i], i); + if ( world_rank == 0) { + hipDeviceProp_t prop; + prop = gpu_props[i]; + printf("AcceleratorHipInit: ========================\n"); + printf("AcceleratorHipInit: Device Number : %d\n", i); + printf("AcceleratorHipInit: ========================\n"); + printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); + + // GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + // GPU_PROP(unifiedAddressing); + // GPU_PROP(l2CacheSize); + // GPU_PROP(singleToDoublePrecisionPerfRatio); + } + } +#undef GPU_PROP_FMT +#undef GPU_PROP +#ifdef GRID_IBM_SUMMIT + // IBM Jsrun makes cuda Device numbering screwy and not match rank + if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); +#else + if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n"); + hipSetDevice(rank); +#endif + if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); +} +#endif + + +#ifdef GRID_SYCL + +cl::sycl::queue *theGridAccelerator; + +void acceleratorInit(void) +{ + int nDevices = 1; + cl::sycl::gpu_selector selector; + cl::sycl::device selectedDevice { selector }; + theGridAccelerator = new sycl::queue (selectedDevice); + + char * 
localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + auto devices = cl::sycl::device::get_devices(); + for(int d = 0;d().c_str()); + +#define GPU_PROP_FMT(prop,FMT) \ + printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); + +#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); + + GPU_PROP_STR(vendor); + GPU_PROP_STR(version); + // GPU_PROP_STR(device_type); + /* + GPU_PROP(max_compute_units); + GPU_PROP(native_vector_width_char); + GPU_PROP(native_vector_width_short); + GPU_PROP(native_vector_width_int); + GPU_PROP(native_vector_width_long); + GPU_PROP(native_vector_width_float); + GPU_PROP(native_vector_width_double); + GPU_PROP(native_vector_width_half); + GPU_PROP(address_bits); + GPU_PROP(half_fp_config); + GPU_PROP(single_fp_config); + */ + // GPU_PROP(double_fp_config); + GPU_PROP(global_mem_size); + + } + if ( world_rank == 0 ) { + auto name = theGridAccelerator->get_device().get_info(); + printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); + printf("AcceleratorSyclInit: ================================================\n"); + } +} +#endif + +#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP)) +void acceleratorInit(void){} +#endif + +NAMESPACE_END(Grid); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h new file mode 100644 index 00000000..74a3ea22 --- /dev/null +++ b/Grid/threads/Accelerator.h @@ -0,0 +1,426 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/Accelerator.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#include + +#ifdef HAVE_MALLOC_MALLOC_H +#include +#endif +#ifdef HAVE_MALLOC_H +#include +#endif +#ifdef HAVE_MM_MALLOC_H +#include +#endif + +NAMESPACE_BEGIN(Grid); + +////////////////////////////////////////////////////////////////////////////////// +// Accelerator primitives; fall back to threading if not CUDA or SYCL +////////////////////////////////////////////////////////////////////////////////// +// +// Function attributes +// +// accelerator +// accelerator_inline +// +// Parallel looping +// +// accelerator_for +// accelerator_forNB +// uint32_t accelerator_barrier(); // device synchronise +// +// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd +// +// uint32_t acceleratorThreads(void); +// void acceleratorThreads(uint32_t); +// +// Warp control and info: +// +// acceleratorInit; +// void acceleratorSynchronise(void); // synch warp etc.. +// int acceleratorSIMTlane(int Nsimd); +// +// Memory management: +// +// void *acceleratorAllocShared(size_t bytes); +// void acceleratorFreeShared(void *ptr); +// +// void *acceleratorAllocDevice(size_t bytes); +// void acceleratorFreeDevice(void *ptr); +// +// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes); +// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes); +// +////////////////////////////////////////////////////////////////////////////////// + +uint32_t acceleratorThreads(void); +void acceleratorThreads(uint32_t); +void acceleratorInit(void); + +////////////////////////////////////////////// +// CUDA acceleration +////////////////////////////////////////////// + +#ifdef GRID_CUDA + +#ifdef __CUDA_ARCH__ +#define GRID_SIMT +#endif + +#define accelerator __host__ __device__ +#define accelerator_inline __host__ __device__ inline + +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return threadIdx.z; +#else + return 0; +#endif +} // CUDA specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ + __VA_ARGS__; \ + }; \ + int nt=acceleratorThreads(); \ + dim3 cu_threads(acceleratorThreads(),1,nsimd); \ + dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ + } + +template __global__ +void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) +{ + uint64_t x = threadIdx.x + blockDim.x*blockIdx.x; + uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; + uint64_t z = threadIdx.z; + if ( (x < num1) && (y +#include +NAMESPACE_BEGIN(Grid); + +extern cl::sycl::queue *theGridAccelerator; + +#ifdef __SYCL_DEVICE_ONLY__ +#define GRID_SIMT +#endif + +#define accelerator +#define accelerator_inline strong_inline + +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; +#else + return 0; +#endif +} // SYCL specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ + unsigned long nt=acceleratorThreads(); \ + unsigned long unum1 = num1; \ + unsigned long unum2 = num2; \ + cl::sycl::range<3> local {nt,1,nsimd}; \ + cl::sycl::range<3> global{unum1,unum2,nsimd}; \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) mutable { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { __VA_ARGS__ }; \ + }); \ + }); + +#define accelerator_barrier(dummy) theGridAccelerator->wait(); + +inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; +inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; +inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} + +#endif + +////////////////////////////////////////////// +// HIP acceleration +////////////////////////////////////////////// +#ifdef GRID_HIP +NAMESPACE_END(Grid); +#include +NAMESPACE_BEGIN(Grid); + +#ifdef __HIP_DEVICE_COMPILE__ +#define GRID_SIMT +#endif + +#define accelerator __host__ __device__ +#define accelerator_inline __host__ __device__ inline + +/*These routines define mapping from thread grid to loop & vector lane indexing */ +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return hipThreadIdx_z; +#else + return 0; +#endif +} // HIP specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane ) mutable { \ + { __VA_ARGS__;} \ + }; \ + int nt=acceleratorThreads(); \ + dim3 hip_threads(nt,1,nsimd); \ + dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ + hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ + 0,0, \ + num1,num2,nsimd,lambda); \ + } + +template __global__ +void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) +{ + uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x; + uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y; + uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z; + if ( (x < numx) && (y /* END LEGAL */ #pragma once -#ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) -#define MIN(x,y) ((x)>(y)?(y):(x)) -#endif - -#define strong_inline __attribute__((always_inline)) inline -#define UNROLL _Pragma("unroll") - -////////////////////////////////////////////////////////////////////////////////// -// New primitives; explicit host thread calls, and accelerator data parallel calls -////////////////////////////////////////////////////////////////////////////////// - -#ifdef _OPENMP -#define GRID_OMP -#include -#endif - -#ifdef GRID_OMP -#define DO_PRAGMA_(x) _Pragma (#x) -#define DO_PRAGMA(x) DO_PRAGMA_(x) -#define thread_num(a) omp_get_thread_num() -#define thread_max(a) omp_get_max_threads() -#else -#define DO_PRAGMA_(x) -#define DO_PRAGMA(x) -#define thread_num(a) (0) -#define thread_max(a) (1) -#endif - -#define thread_for( i, num, ... 
) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i __global__ -void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda) -{ - uint64_t isite = threadIdx.y; - uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x; - if ( (osite >>(nsimd,num,lambda); \ - } - -// Copy the for_each_n style ; Non-blocking variant (default -#define accelerator_for( iterator, num, nsimd, ... ) \ - accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \ - accelerator_barrier(dummy); - -#else - -#define accelerator -#define accelerator_inline strong_inline -#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); -#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); -#define accelerator_barrier(dummy) - -#endif +#include +#include diff --git a/Grid/threads/ThreadReduction.h b/Grid/threads/ThreadReduction.h new file mode 100644 index 00000000..f0d24d50 --- /dev/null +++ b/Grid/threads/ThreadReduction.h @@ -0,0 +1,127 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/ThreadReduction.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +// Introduce a class to gain deterministic bit reproducible reduction. +// make static; perhaps just a namespace is required. 
+NAMESPACE_BEGIN(Grid); + +class GridThread { +public: + static int _threads; + static int _hyperthreads; + static int _cores; + + static void SetCores(int cr) { +#ifdef GRID_OMP + _cores = cr; +#else + _cores = 1; +#endif + } + static void SetThreads(int thr) { +#ifdef GRID_OMP + _threads = MIN(thr,omp_get_max_threads()) ; + omp_set_num_threads(_threads); +#else + _threads = 1; +#endif + }; + static void SetMaxThreads(void) { +#ifdef GRID_OMP + _threads = omp_get_max_threads(); + omp_set_num_threads(_threads); +#else + _threads = 1; +#endif + }; + static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; }; + static int GetCores(void) { return _cores; }; + static int GetThreads(void) { return _threads; }; + static int SumArraySize(void) {return _threads;}; + + static void GetWork(int nwork, int me, int & mywork, int & myoff){ + GetWork(nwork,me,mywork,myoff,_threads); + } + static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){ + int basework = nwork/units; + int backfill = units-(nwork%units); + if ( me >= units ) { + mywork = myoff = 0; + } else { + mywork = (nwork+me)/units; + myoff = basework * me; + if ( me > backfill ) + myoff+= (me-backfill); + } + return; + }; + + static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){ + me = ThreadBarrier(); + GetWork(nwork,me,mywork,myoff); + }; + + static int ThreadBarrier(void) { +#ifdef GRID_OMP +#pragma omp barrier + return omp_get_thread_num(); +#else + return 0; +#endif + }; + + template static void ThreadSum( std::vector &sum_array,obj &val,int me){ + sum_array[me] = val; + val=Zero(); + ThreadBarrier(); + for(int i=0;i<_threads;i++) val+= sum_array[i]; + ThreadBarrier(); + } + + static void bcopy(const void *src, void *dst, size_t len) { +#ifdef GRID_OMP +#pragma omp parallel + { + const char *c_src =(char *) src; + char *c_dest=(char *) dst; + int me,mywork,myoff; + GridThread::GetWorkBarrier(len,me, mywork,myoff); + bcopy(&c_src[myoff],&c_dest[myoff],mywork); + } +#else + bcopy(src,dst,len); +#endif + } + + +}; + +NAMESPACE_END(Grid); + diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 29cae060..a9fa13ea 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -28,101 +28,47 @@ Author: paboyle /* END LEGAL */ #pragma once +#ifndef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) +#define MIN(x,y) ((x)>(y)?(y):(x)) +#endif -// Introduce a class to gain deterministic bit reproducible reduction. -// make static; perhaps just a namespace is required. 
-NAMESPACE_BEGIN(Grid); +#define strong_inline __attribute__((always_inline)) inline +#define UNROLL _Pragma("unroll") -class GridThread { -public: - static int _threads; - static int _hyperthreads; - static int _cores; +////////////////////////////////////////////////////////////////////////////////// +// New primitives; explicit host thread calls, and accelerator data parallel calls +////////////////////////////////////////////////////////////////////////////////// + +#ifdef _OPENMP +#define GRID_OMP +#include +#endif - static void SetCores(int cr) { #ifdef GRID_OMP - _cores = cr; +#define DO_PRAGMA_(x) _Pragma (#x) +#define DO_PRAGMA(x) DO_PRAGMA_(x) +#define thread_num(a) omp_get_thread_num() +#define thread_max(a) omp_get_max_threads() #else - _cores = 1; +#define DO_PRAGMA_(x) +#define DO_PRAGMA(x) +#define thread_num(a) (0) +#define thread_max(a) (1) #endif - } - static void SetThreads(int thr) { -#ifdef GRID_OMP - _threads = MIN(thr,omp_get_max_threads()) ; - omp_set_num_threads(_threads); -#else - _threads = 1; -#endif - }; - static void SetMaxThreads(void) { -#ifdef GRID_OMP - _threads = omp_get_max_threads(); - omp_set_num_threads(_threads); -#else - _threads = 1; -#endif - }; - static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; }; - static int GetCores(void) { return _cores; }; - static int GetThreads(void) { return _threads; }; - static int SumArraySize(void) {return _threads;}; - static void GetWork(int nwork, int me, int & mywork, int & myoff){ - GetWork(nwork,me,mywork,myoff,_threads); - } - static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){ - int basework = nwork/units; - int backfill = units-(nwork%units); - if ( me >= units ) { - mywork = myoff = 0; - } else { - mywork = (nwork+me)/units; - myoff = basework * me; - if ( me > backfill ) - myoff+= (me-backfill); - } - return; - }; - - static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){ - me = ThreadBarrier(); - GetWork(nwork,me,mywork,myoff); - }; - - static int ThreadBarrier(void) { -#ifdef GRID_OMP -#pragma omp barrier - return omp_get_thread_num(); -#else - return 0; -#endif - }; - - template static void ThreadSum( std::vector &sum_array,obj &val,int me){ - sum_array[me] = val; - val=Zero(); - ThreadBarrier(); - for(int i=0;i<_threads;i++) val+= sum_array[i]; - ThreadBarrier(); - } - - static void bcopy(const void *src, void *dst, size_t len) { -#ifdef GRID_OMP -#pragma omp parallel - { - const char *c_src =(char *) src; - char *c_dest=(char *) dst; - int me,mywork,myoff; - GridThread::GetWorkBarrier(len,me, mywork,myoff); - bcopy(&c_src[myoff],&c_dest[myoff],mywork); - } -#else - bcopy(src,dst,len); -#endif - } - - -}; - -NAMESPACE_END(Grid); +#define thread_for( i, num, ... 
) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i=0); assert(sz<=MaxEntries); +#endif _size = sz; } accelerator_inline void resize(size_type sz,const value &val) { - assert(sz>=0); - assert(sz<=MaxEntries); - _size = sz; + resize(sz); for(int s=0;s ©me) { diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 1b672141..656e29a9 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -73,8 +73,6 @@ feenableexcept (unsigned int excepts) } #endif -uint32_t gpu_threads=8; - NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////// @@ -192,16 +190,12 @@ void GridParseLayout(char **argv,int argc, assert(ompthreads.size()==1); GridThread::SetThreads(ompthreads[0]); } - if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){ + if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){ std::vector gputhreads(0); -#ifndef GRID_NVCC - std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was" - << " not compiled with GPU support" << std::endl; -#endif - arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads"); + arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads"); GridCmdOptionIntVector(arg,gputhreads); assert(gputhreads.size()==1); - gpu_threads=gputhreads[0]; + acceleratorThreads(gputhreads[0]); } if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ @@ -241,8 +235,6 @@ static int Grid_is_initialised; ///////////////////////////////////////////////////////// void GridBanner(void) { - static int printed =0; - if( !printed ) { std::cout < -#ifdef GRID_NVCC +#ifdef GRID_CUDA #define CUDA_PROFILE #endif @@ -129,8 +129,8 @@ int main (int argc, char ** argv) LatticeGaugeField Umu5d(FGrid); std::vector U(4,FGrid); { - auto Umu5d_v = Umu5d.View(); - auto Umu_v = Umu.View(); + autoView( Umu5d_v, Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s & latt4, int Ls, int threads,int report ) LatticeGaugeField Umu5d(FGrid); // replicate across fifth dimension - auto Umu5d_v = Umu5d.View(); - auto Umu_v = Umu.View(); - for(int ss=0;ssoSites();ss++){ - for(int s=0;soSites();ss++){ + for(int s=0;s > &mat, for(int b=0;b > &mat, for(int b=0;b > &mat int ss= so+n*stride+b; for(int i=0;i > &m for(int i=0;i > &m // Trigger unroll for ( int m=0;m +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + + typedef typename ImprovedStaggeredFermionF::FermionField FermionField; + typename ImprovedStaggeredFermionF::ImplParams params; + + FermionField src (&Grid); random(pRNG,src); + FermionField result(&Grid); result=Zero(); + FermionField ref(&Grid); ref=Zero(); + FermionField tmp(&Grid); tmp=Zero(); + FermionField err(&Grid); tmp=Zero(); + LatticeGaugeFieldF Umu(&Grid); random(pRNG,Umu); + std::vector U(4,&Grid); + + double volume=1; + for(int mu=0;mu(Umu,U[nn],nn); + } +#endif + + for(int mu=0;mu(Umu,mu); + } + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); + + std::cout< using namespace std; using namespace Grid; - ; template struct scal { @@ -51,6 +50,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; + { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -100,6 +100,8 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o,result_o_2); + MemoryManager::Print(); + LatticeFermionD diff_o(FrbGrid); RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); @@ -130,7 +132,9 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " CG checksums "<oSites();i++){ auto SE = gStencil.GetEntry(0,i); - auto check = Check.View(); - auto foo = Foo.View(); - + autoView(check, Check, CpuWrite); + autoView( foo, Foo, CpuRead); // Encapsulate in a general wrapper check[i] = foo[SE->_offset]; auto tmp=check[i]; if (SE->_permute & 0x1 ) { permute(check[i],tmp,0); tmp=check[i];} @@ -147,8 +146,8 @@ int main(int argc, char ** argv) }}}} if (nrm > 1.0e-4) { - auto check = Check.View(); - auto bar = Bar.View(); + autoView( check , Check, CpuRead); + autoView( bar , Bar, CpuRead); for(int i=0;i_is_local && SE->_permute ) permute(check[i],foo[SE->_offset],permute_type); else if (SE->_is_local) @@ -151,8 +151,8 @@ int main(int argc, char ** argv) { }}}} if (nrm > 1.0e-4) { - auto check = Check.View(); - auto bar = Bar.View(); + autoView( check , Check, CpuRead); + autoView( bar , Bar, CpuRead); for(int i=0;i " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(ocheck[i],efoo[SE->_offset],permute_type); else if (SE->_is_local) @@ -226,8 +226,8 @@ int main(int argc, char ** argv) { SE = OStencil.GetEntry(permute_type,0,i); // std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) 
permute(echeck[i],ofoo[SE->_offset],permute_type); else if (SE->_is_local) diff --git a/tests/core/Test_cshift_red_black.cc b/tests/core/Test_cshift_red_black.cc index 34325072..4fdd5fc0 100644 --- a/tests/core/Test_cshift_red_black.cc +++ b/tests/core/Test_cshift_red_black.cc @@ -82,7 +82,7 @@ int main (int argc, char ** argv) pickCheckerboard(Odd,Uo,U); // std::cout<oSites();ss++){ - for(int s=0;soSites();ss++){ + for(int s=0;s U(4,FGrid); for(int mu=0;mu +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + + typedef typename NaiveStaggeredFermionR::FermionField FermionField; + typedef typename NaiveStaggeredFermionR::ComplexField ComplexField; + typename NaiveStaggeredFermionR::ImplParams params; + + FermionField src (&Grid); random(pRNG,src); + FermionField result(&Grid); result=Zero(); + FermionField ref(&Grid); ref=Zero(); + FermionField tmp(&Grid); tmp=Zero(); + FermionField err(&Grid); tmp=Zero(); + FermionField phi (&Grid); random(pRNG,phi); + FermionField chi (&Grid); random(pRNG,chi); + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + std::vector U(4,&Grid); + + + double volume=1; + for(int mu=0;mu(Umu,mu); + /* Debug force unit + U[mu] = 1.0; + PokeIndex(Umu,U[mu],mu); + */ + } + + ref = Zero(); + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD u0=1.0; + + { // Simple improved staggered implementation + ref = Zero(); + RealD c1tad = 0.5*c1/u0; + + Lattice > coor(&Grid); + + Lattice > x(&Grid); LatticeCoordinate(x,0); + Lattice > y(&Grid); LatticeCoordinate(y,1); + Lattice > z(&Grid); LatticeCoordinate(z,2); + Lattice > t(&Grid); LatticeCoordinate(t,3); + + Lattice > lin_z(&Grid); lin_z=x+y; + Lattice > lin_t(&Grid); lin_t=x+y+z; + + for(int mu=0;mu * = < chi | Deo^dag| phi> "< HermOpEO(Ds); + HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); + HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + + HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); + HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + + pDce = innerProduct(phi_e,dchi_e); + pDco = innerProduct(phi_o,dchi_o); + cDpe = innerProduct(chi_e,dphi_e); + cDpo = innerProduct(chi_o,dphi_o); + + std::cout< U(4,FGrid); { - auto Umu5d_v = 
Umu5d.View(); - auto Umu_v = Umu.View(); + autoView( Umu5d_v , Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;soSites(),{ uint64_t ss= sss*Ls; typedef vSpinColourVector spinor; diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index 4eeb8c27..cb30faad 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -98,9 +98,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc index 009f50b3..81a1b8c4 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -100,9 +100,9 @@ int main (int argc, char ** argv) // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 670e7589..0b0ba346 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -110,9 +110,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index d762e22a..b39fdd14 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -119,9 +119,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc index 66ae9dcf..58258a5e 100644 --- a/tests/forces/Test_dwf_gpforce_eofa.cc +++ b/tests/forces/Test_dwf_gpforce_eofa.cc @@ -114,9 +114,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc index c4e214bb..21f0b9d0 100644 --- a/tests/forces/Test_gp_plaq_force.cc +++ b/tests/forces/Test_gp_plaq_force.cc @@ -85,9 +85,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); - auto mom_v = mom.View(); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + 
mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index 2573af6a..bb4ea6de 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -87,9 +87,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto Uprime_v= Uprime.View(); - auto U_v = U.View(); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index 09a1dc4b..bdc332d9 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -105,9 +105,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index cd30d898..1c85a5d9 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -99,9 +99,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index a1c4e930..11e69652 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -101,9 +101,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index f71e2d41..f85501fa 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -112,9 +112,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 4975f36d..68163e63 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -115,9 +115,9 @@ int main (int argc, char** argv) SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); // fourth order 
exponential approx thread_foreach( i, mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt + mom_v[i](mu) *mom_v[i](mu) *U_v[i](mu)*(dt*dt/2.0) diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc index 3ea2c6aa..17dce530 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -101,9 +101,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc index 9a78de24..ed72f2c0 100644 --- a/tests/forces/Test_rect_force.cc +++ b/tests/forces/Test_rect_force.cc @@ -87,9 +87,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); - auto mom_v = mom.View(); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index 47f1516a..c8b3a7f4 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -105,9 +105,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu); Uprime_v[i](mu) += mom_v[i](mu)*U_v[i](mu)*dt ; diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index d9ace23c..f26f0ac9 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -105,9 +105,9 @@ int main(int argc, char **argv) Hmom -= real(sum(trace(mommu * mommu))); PokeIndex(mom, mommu, mu); - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); - auto mom_v = mom.View(); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(ss,mom_v, { Uprime_v[ss]._internal[mu] = ProjectOnGroup(Exponentiate(mom_v[ss]._internal[mu], dt, 12) * U_v[ss]._internal[mu]); diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index 2730885f..e24ae601 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -114,9 +114,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc index 6fa90f32..3b8cdda6 100644 --- a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc @@ -35,7 +35,7 @@ directory int main(int argc, char **argv) { -#ifndef GRID_NVCC 
+#ifndef GRID_CUDA using namespace Grid; diff --git a/tests/hmc/Test_multishift_sqrt.cc b/tests/hmc/Test_multishift_sqrt.cc index f8477220..31697c12 100644 --- a/tests/hmc/Test_multishift_sqrt.cc +++ b/tests/hmc/Test_multishift_sqrt.cc @@ -31,7 +31,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; template class DumbOperator : public LinearOperatorBase { public: @@ -57,7 +56,7 @@ public: // Support for coarsening to a multigrid void OpDiag (const Field &in, Field &out) {}; void OpDir (const Field &in, Field &out,int dir,int disp){}; - void OpDirAll (const Field &in, std::vector &out) {}; // Abstract base + void OpDirAll (const Field &in, std::vector &out) {}; void Op (const Field &in, Field &out){ out = scale * in; @@ -105,7 +104,7 @@ int main (int argc, char ** argv) GridDefaultMpi()); double lo=0.001; - double hi=1.0; + double hi=20.0; int precision=64; int degree=10; AlgRemez remez(lo,hi,precision); diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 873530ff..8e083231 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,5 +1,3 @@ - - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -302,8 +300,8 @@ int main (int argc, char ** argv) int nb=nbasisc/2; CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0); for(int n=0;noSites();site++){ subspace_g5[site](nn) = subspace[site](nn); diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc new file mode 100644 index 00000000..9e11c160 --- /dev/null +++ b/tests/solver/Test_dwf_multigrid.cc @@ -0,0 +1,594 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + 
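
Editorial aside, not part of the patch: the ChebyshevSmoother above approximates (M^dag M)^{-1} M^dag in with a low-order Chebyshev fit to f(x)=1/x on the window [lo,hi]; a deliberately rough fit is all a smoother needs. The standalone scalar sketch below (every name in it is illustrative, nothing here is Grid API) builds that fit for the (lo,hi,order)=(0.5,60.0,10) window used for the fine smoother later in this test, with the same three-term recurrence that Grid's Chebyshev class applies with a LinearOperator in place of x.

#include <cmath>
#include <cstdio>
#include <vector>

// Evaluate an order-'order' Chebyshev fit to 1/x on [lo,hi] at the point x.
double chebyshev_inverse(double x, double lo, double hi, int order) {
  std::vector<double> c(order);
  for (int j = 0; j < order; j++) {                       // coefficients from the Chebyshev nodes
    double s = 0.0;
    for (int k = 0; k < order; k++) {
      double node = std::cos(M_PI*(k+0.5)/order);
      double y    = 0.5*(hi+lo) + 0.5*(hi-lo)*node;       // node mapped into [lo,hi]
      s += (1.0/y)*std::cos(M_PI*j*(k+0.5)/order);
    }
    c[j] = 2.0*s/order;
  }
  double t = (2.0*x - hi - lo)/(hi - lo);                 // map x back to [-1,1]
  double Tprev = 1.0, Tcur = t;
  double sum = 0.5*c[0] + c[1]*t;
  for (int j = 2; j < order; j++) {                       // T_{j+1} = 2 t T_j - T_{j-1}
    double Tnext = 2.0*t*Tcur - Tprev;
    sum += c[j]*Tnext;
    Tprev = Tcur; Tcur = Tnext;
  }
  return sum;
}

int main() {
  // (lo,hi,order)=(0.5,60.0,10): the window used for the fine smoother in this test.
  for (double x : {1.0, 5.0, 20.0, 50.0})
    std::printf("x=%5.1f  poly=%8.4f  exact=%8.4f\n",
                x, chebyshev_inverse(x, 0.5, 60.0, 10), 1.0/x);
  return 0;
}
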
+#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, + FineOperator &Fine, + FineSmoother &Smoother, + Guesser &Guess_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _CoarseOperator(Coarse), + _FineOperator(Fine), + _Smoother(Smoother), + _Guess(Guess_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + CoarseVector Csrc(_CoarseOperator.Grid()); + CoarseVector Csol(_CoarseOperator.Grid()); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + 
Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,500,100,100,0.0); + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOpPV(Dpv); + + std::cout< CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + // GeneralisedMinimalResidual CoarseGMRES(tol,MaxIt,20); + + BiCGSTAB FineBiCGSTAB(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + // GeneralisedMinimalResidual FineGMRES(tol,MaxIt,20); + + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + PVdagMLinearOperator FinePVdagM(Ddwf,Dpv);// M_{pv}^\dag M + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + SchurDiagOneOperator FineDiagOne(Ddwf); // 1 - M_ee^{-1} Meo Moo^{-1} Moe e + + MdagMLinearOperator CoarseMdagM(LDOp); + PVdagMLinearOperator CoarsePVdagM(LDOp,LDOpPV); + + std::cout< IRLCheby(0.03,12.0,71); // 1 iter + FunctionHermOp IRLOpCheby(IRLCheby,CoarseMdagM); + PlainHermOp IRLOp (CoarseMdagM); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + + std::cout< DeflCoarseGuesser(evec,eval); + NormalEquations DeflCoarseCGNE (LDOp,CoarseCG,DeflCoarseGuesser); + c_res=Zero(); + DeflCoarseCGNE(c_src,c_res); + + + std::cout< CoarseMgridCG(0.001,1000); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + + typedef HDCRPreconditioner > TwoLevelHDCR; + TwoLevelHDCR TwoLevelPrecon(Aggregates, + HermIndefOp, + FineSmoother, + DeflCoarseCGNE); + TwoLevelPrecon.Level(1); + // PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + PrecGeneralisedConjugateResidualNonHermitian l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + + f_res=Zero(); + + CoarseCG.Tolerance=0.02; + l1PGCR(f_src,f_res); + + std::cout< CoarseMgridBiCGSTAB(0.01,1000); + BiCGSTAB FineMgridBiCGSTAB(0.0,24); + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser FineZeroGuesser; + + SolverWrapper FineBiCGSmoother( FinePVdagM, FineMgridBiCGSTAB, FineZeroGuesser); + SolverWrapper CoarsePVdagMSolver(CoarsePVdagM,CoarseMgridBiCGSTAB,CoarseZeroGuesser); + typedef HDCRPreconditioner > TwoLevelMG; + + TwoLevelMG _TwoLevelMG(Aggregates, + FinePVdagM, + FineBiCGSmoother, + CoarsePVdagMSolver); + _TwoLevelMG.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian pvPGCR(1.0e-8,100,FinePVdagM,_TwoLevelMG,16,16); + pvPGCR.Level(1); + + f_res=Zero(); + pvPGCR(f_src,f_res); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +//#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; 
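
Editorial aside, not part of the patch: the HDCRPreconditioner defined just below implements a two-level cycle: apply the fine smoother, restrict the resulting residual into the blocked subspace (ProjectToSubspace), solve the coarsened system, prolongate the coarse correction back (PromoteFromSubspace) and add it. The standalone 1D Poisson toy below has the same control flow; restrict_fw, prolong_add and coarse_solve stand in for the subspace projection, promotion and the coarse-level solver, and every name in it is illustrative rather than Grid API.

#include <cstdio>
#include <vector>
using Vec = std::vector<double>;

// 1D Dirichlet Laplacian on spacing h: (A u)_i = (2 u_i - u_{i-1} - u_{i+1}) / h^2
Vec apply(const Vec &u, double h) {
  int n = u.size(); Vec Au(n);
  for (int i = 0; i < n; i++) {
    double ul = (i > 0)   ? u[i-1] : 0.0;
    double ur = (i < n-1) ? u[i+1] : 0.0;
    Au[i] = (2.0*u[i] - ul - ur)/(h*h);
  }
  return Au;
}

void smooth(Vec &u, const Vec &f, double h, int sweeps) {   // weighted Jacobi, omega = 2/3
  for (int s = 0; s < sweeps; s++) {
    Vec Au = apply(u, h);
    for (size_t i = 0; i < u.size(); i++) u[i] += (2.0/3.0)*(f[i] - Au[i])*h*h/2.0;
  }
}

Vec restrict_fw(const Vec &r) {                             // full-weighting restriction
  int nc = (r.size() - 1)/2; Vec rc(nc);
  for (int I = 0; I < nc; I++) rc[I] = 0.25*r[2*I] + 0.5*r[2*I+1] + 0.25*r[2*I+2];
  return rc;
}

void prolong_add(Vec &u, const Vec &ec) {                   // linear interpolation, added as a correction
  for (size_t I = 0; I < ec.size(); I++) {
    u[2*I] += 0.5*ec[I];  u[2*I+1] += ec[I];  u[2*I+2] += 0.5*ec[I];
  }
}

Vec coarse_solve(const Vec &rc, double hc) {                // exact tridiagonal (Thomas) solve of A_{2h} e = rc
  int n = rc.size();
  Vec d(n, 2.0/(hc*hc)), o(n, -1.0/(hc*hc)), x(rc);
  for (int i = 1; i < n; i++) { double m = o[i]/d[i-1]; d[i] -= m*o[i-1]; x[i] -= m*x[i-1]; }
  x[n-1] /= d[n-1];
  for (int i = n-2; i >= 0; i--) x[i] = (x[i] - o[i]*x[i+1])/d[i];
  return x;
}

int main() {
  int n = 63; double h = 1.0/(n+1);
  Vec u(n, 0.0), f(n, 1.0);                                 // solve A u = 1 from a zero guess
  for (int cycle = 0; cycle < 8; cycle++) {
    smooth(u, f, h, 3);                                     // fine smoother       ( _Smoother(in,out) )
    Vec Au = apply(u, h), r(n);
    for (int i = 0; i < n; i++) r[i] = f[i] - Au[i];        // fine-grid residual
    Vec rc = restrict_fw(r);                                // ProjectToSubspace analogue
    Vec ec = coarse_solve(rc, 2.0*h);                       // coarse solve        ( _CoarseSolve )
    prolong_add(u, ec);                                     // PromoteFromSubspace + add correction
    smooth(u, f, h, 3);                                     // post-smooth
    double nrm2 = 0.0; Au = apply(u, h);
    for (int i = 0; i < n; i++) nrm2 += (f[i]-Au[i])*(f[i]-Au[i]);
    std::printf("cycle %d  |residual|^2 = %.3e\n", cycle, nrm2);
  }
  return 0;
}
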
+ +#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + const int nbasis= 32; + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< SubspaceOp(Dw); + + Subspace Aggregates4D(Coarse4d,UGrid,0); + Subspace Aggregates5D(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + std::cout< Level1Op; + + + NonHermitianLinearOperator LinOpDwf(Ddwf); + + Level1Op LDOp (*Coarse5d,1); + LDOp.CoarsenOperator(FGrid,LinOpDwf,Aggregates5D); + + std::cout<
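
Editorial aside, not part of the patch: the change repeated throughout the test and benchmark hunks above replaces the unscoped accessor  auto x_v = x.View();  with the scoped  autoView(x_v, x, Mode)  macro, where the access mode (CpuRead, CpuWrite, AcceleratorRead, AcceleratorWrite, ...) tells Grid's memory manager which copy of the lattice data is needed and whether it is dirtied by the access. The sketch below shows the new pattern in a minimal Grid program, mirroring the Umu5d-style host-side copy loops in the benchmark hunks; it assumes a working Grid build at the revision of this patch and is illustrative only.

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                             GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  LatticeGaugeField Umu(UGrid);
  LatticeGaugeField Ucopy(UGrid);
  SU3::HotConfiguration(RNG4, Umu);

  {
    // Scoped host views with explicit access modes; the closing brace releases
    // them so the fields may afterwards be handed to accelerator kernels.
    autoView( Umu_v  , Umu  , CpuRead );
    autoView( Ucopy_v, Ucopy, CpuWrite);
    for (int ss = 0; ss < UGrid->oSites(); ss++) {
      Ucopy_v[ss] = Umu_v[ss];
    }
  }

  Grid_finalize();
  return 0;
}
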