diff --git a/Grid/GridCore.h b/Grid/GridCore.h index 495a81e1..2209f960 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -47,10 +47,9 @@ Author: paboyle #include #include #include -#include -#include +#include #include -#include +#include #include #include #include diff --git a/Grid/GridStd.h b/Grid/GridStd.h index 16cfcf50..ecb561ea 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -6,6 +6,7 @@ /////////////////// #include #include +#include #include #include #include diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 9d779e05..9556c03d 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -18,19 +18,20 @@ #pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__NVCC__") #pragma push_macro("__CUDACC__") +#undef __CUDA_ARCH__ #undef __NVCC__ #undef __CUDACC__ -#undef __CUDA_ARCH__ #define __NVCC__REDEFINE__ #endif /* SYCL save and restore compile environment*/ -#ifdef __SYCL_DEVICE_ONLY__ +#ifdef GRID_SYCL #pragma push #pragma push_macro("__SYCL_DEVICE_ONLY__") #undef __SYCL_DEVICE_ONLY__ -#undef EIGEN_USE_SYCL #define EIGEN_DONT_VECTORIZE +//#undef EIGEN_USE_SYCL +#define __SYCL__REDEFINE__ #endif @@ -41,7 +42,7 @@ #ifdef __NVCC__REDEFINE__ #pragma pop_macro("__CUDACC__") #pragma pop_macro("__NVCC__") -#pragma pop_macro("__CUDA_ARCH__") +#pragma pop_macro("GRID_SIMT") #pragma pop #endif diff --git a/Grid/Makefile.am b/Grid/Makefile.am index b88ea4f2..f1fa462e 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -21,7 +21,7 @@ if BUILD_HDF5 extra_headers+=serialisation/Hdf5Type.h endif -all: version-cache +all: version-cache Version.h version-cache: @if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ @@ -42,7 +42,7 @@ version-cache: fi;\ rm -f vertmp -Version.h: +Version.h: version-cache cp version-cache Version.h .PHONY: version-cache diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 48ea194b..7f27784b 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -29,9 +29,11 @@ Author: Peter Boyle #ifndef GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H +NAMESPACE_CHECK(algorithms); #include #include #include +NAMESPACE_CHECK(SparseMatrix); #include #include @@ -41,10 +43,12 @@ Author: Peter Boyle #include #include #include - +NAMESPACE_CHECK(approx); #include #include +NAMESPACE_CHECK(ConjGrad); #include +NAMESPACE_CHECK(BiCGSTAB); #include #include #include @@ -62,7 +66,9 @@ Author: Peter Boyle #include #include +NAMESPACE_CHECK(PowerMethod); #include +NAMESPACE_CHECK(CoarsendMatrix); #include #endif diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 4c26f799..8d184aea 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -1,14 +1,3 @@ - // blockZaxpy in bockPromote - 3s, 5% - // noncoalesced linalg in Preconditionoer ~ 3s 5% - // Lancos tuning or replace 10-20s ~ 25%, open ended - // setup tuning 5s ~ 8% - // -- e.g. ordermin, orderstep tunables. - // MdagM path without norm in LinOp code. 
few seconds - - // Mdir calc blocking kernels - // Fuse kernels in blockMaskedInnerProduct - // preallocate Vectors in Cayley 5D ~ few percent few seconds - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -91,34 +80,7 @@ public: } directions [2*_d]=0; displacements[2*_d]=0; - - //// report back - std::cout< GetDelta(int point) { - std::vector delta(dimension,0); - delta[directions[point]] = displacements[point]; - return delta; - }; - */ }; @@ -149,25 +111,7 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<oSites(),1,{ - eProj[ss](i)=CComplex(1.0); - }); - eProj=eProj - iProj; - std::cout< &hermop,int nn=nbasis) { @@ -190,12 +129,12 @@ public: FineField Mn(FineGrid); for(int b=0;b "< &hermop, int nn, double hi, @@ -280,10 +219,10 @@ public: hermop.HermOp(*Tn,y); - auto y_v = y.View(); - auto Tn_v = Tn->View(); - auto Tnp_v = Tnp->View(); - auto Tnm_v = Tnm->View(); + autoView( y_v , y, AcceleratorWrite); + autoView( Tn_v , (*Tn), AcceleratorWrite); + autoView( Tnp_v , (*Tnp), AcceleratorWrite); + autoView( Tnm_v , (*Tnm), AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); @@ -313,201 +252,6 @@ public: } assert(b==nn); } -#endif -#if 0 - virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, - int nn, - double hi, - double lo, - int orderfilter, - int ordermin, - int orderstep, - double filterlo - ) { - - RealD scale; - - FineField noise(FineGrid); - FineField Mn(FineGrid); - FineField tmp(FineGrid); - FineField combined(FineGrid); - - // New normalised noise - gaussian(RNG,noise); - scale = std::pow(norm2(noise),-0.5); - noise=noise*scale; - - // Initial matrix element - hermop.Op(noise,Mn); std::cout< "< Cheb(llo,hhi,oorder); \ - Cheb(hermop,noise,Mn); \ - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ - subspace[b] = Mn; \ - hermop.Op(Mn,tmp); \ - std::cout< "< Cheb(0.002,60.0,1500,-0.5,3.5); \ - - RealD alpha=-0.8; - RealD beta =-0.8; -#define FILTER(llo,hhi,oorder) \ - { \ - Chebyshev Cheb(llo,hhi,oorder); \ - /* JacobiPolynomial Cheb(0.0,60.0,oorder,alpha,beta);*/\ - Cheb(hermop,noise,Mn); \ - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \ - subspace[b] = Mn; \ - hermop.Op(Mn,tmp); \ - std::cout< "< Cheb(llo,hhi,oorder); \ - Cheb(hermop,noise,combined); \ - } - - double node = 0.000; - FILTERb(lo,hi,orderfilter);// 0 - // FILTERc(node,hi,51);// 0 - noise = Mn; - int base = 0; - int mult = 100; - FILTER(node,hi,base+1*mult); - FILTER(node,hi,base+2*mult); - FILTER(node,hi,base+3*mult); - FILTER(node,hi,base+4*mult); - FILTER(node,hi,base+5*mult); - FILTER(node,hi,base+6*mult); - FILTER(node,hi,base+7*mult); - FILTER(node,hi,base+8*mult); - FILTER(node,hi,base+9*mult); - FILTER(node,hi,base+10*mult); - FILTER(node,hi,base+11*mult); - FILTER(node,hi,base+12*mult); - FILTER(node,hi,base+13*mult); - FILTER(node,hi,base+14*mult); - FILTER(node,hi,base+15*mult); - assert(b==nn); - } -#endif - -#if 0 - virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase &hermop, - int nn, - double hi, - double lo, - int orderfilter, - int ordermin, - int orderstep, - double filterlo - ) { - - RealD scale; - - FineField noise(FineGrid); - FineField Mn(FineGrid); - FineField tmp(FineGrid); - FineField combined(FineGrid); - - // New normalised noise - gaussian(RNG,noise); - scale = 
std::pow(norm2(noise),-0.5); - noise=noise*scale; - - // Initial matrix element - hermop.Op(noise,Mn); std::cout< "< JacobiPoly(0.005,60.,1500); - // JacobiPolynomial JacobiPoly(0.002,60.0,1500,-0.5,3.5); - //JacobiPolynomial JacobiPoly(0.03,60.0,500,-0.5,3.5); - // JacobiPolynomial JacobiPoly(0.00,60.0,1000,-0.5,3.5); - JacobiPoly(hermop,noise,Mn); - scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; - subspace[b] = Mn; - hermop.Op(Mn,tmp); - std::cout< "< "< Stencil; std::vector A; - + /////////////////////// // Interface /////////////////////// @@ -549,13 +293,13 @@ public: SimpleCompressor compressor; Stencil.HaloExchange(in,compressor); - - auto in_v = in.View(); - auto out_v = out.View(); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); typedef LatticeView Aview; - + Vector AcceleratorViewContainer; - for(int p=0;p_is_local) { - nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); } - synchronise(); + acceleratorSynchronise(); for(int bb=0;bb Aview; Vector AcceleratorViewContainer; - for(int p=0;p_is_local) { - nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); } - synchronise(); + acceleratorSynchronise(); for(int bb=0;bboSites(),1,{ - - siteVector res = Zero(); - siteVector nbr; - int ptype; - StencilEntry *SE; - - SE=Stencil.GetEntry(ptype,point,ss); - - if(SE->_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; - } else { - nbr = Stencil.CommBuf()[SE->_offset]; - } - synchronise(); - - res = res + Aview_p[point][ss]*nbr; - - out_v[ss]=res; - }); -#endif + for(int p=0;p &out) { @@ -841,10 +562,10 @@ public: blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi); - auto iZProj_v = iZProj.View() ; - auto oZProj_v = oZProj.View() ; - auto A_p = A[p].View(); - auto A_self = A[self_stencil].View(); + autoView( iZProj_v , iZProj, AcceleratorRead) ; + autoView( oZProj_v , oZProj, AcceleratorRead) ; + autoView( A_p , A[p], AcceleratorWrite); + autoView( A_self , A[self_stencil], AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); @@ -860,11 +581,11 @@ public: mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); { - auto tmp_ = tmp.View(); - auto evenmask_ = evenmask.View(); - auto oddmask_ = oddmask.View(); - auto Mphie_ = Mphie.View(); - auto Mphio_ = Mphio.View(); + autoView( tmp_ , tmp, AcceleratorWrite); + autoView( evenmask_ , evenmask, AcceleratorRead); + autoView( oddmask_ , oddmask, AcceleratorRead); + autoView( Mphie_ , Mphie, AcceleratorRead); + autoView( Mphio_ , Mphio, AcceleratorRead); accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); }); @@ -872,8 +593,8 @@ public: blockProject(SelfProj,tmp,Subspace.subspace); - auto SelfProj_ = SelfProj.View(); - auto A_self = A[self_stencil].View(); + autoView( SelfProj_ , SelfProj, AcceleratorRead); + autoView( A_self , A[self_stencil], AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j bc(FineGrid->_ndimension,0); - - 
blockPick(Grid(),phi,tmp,bc); // Pick out a block - linop.Op(tmp,Mphi); // Apply big dop - blockProject(iProj,Mphi,Subspace.subspace); // project it and print it - std::cout< #endif #endif - NAMESPACE_BEGIN(Grid); template struct FFTW { }; @@ -191,7 +189,7 @@ public: typedef typename sobj::scalar_type scalar; Lattice pgbuf(&pencil_g); - auto pgbuf_v = pgbuf.View(); + autoView(pgbuf_v , pgbuf, CpuWrite); typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; @@ -232,15 +230,18 @@ public: result = source; int pc = processor_coor[dim]; for(int p=0;plSites(),{ + { + autoView(r_v,result,CpuRead); + autoView(p_v,pgbuf,CpuWrite); + thread_for(idx, sgrid->lSites(),{ Coordinate cbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,cbuf); - peekLocalSite(s,result,cbuf); + peekLocalSite(s,r_v,cbuf); cbuf[dim]+=((pc+p) % processors[dim])*L; - // cbuf[dim]+=p*L; - pokeLocalSite(s,pgbuf,cbuf); - }); + pokeLocalSite(s,p_v,cbuf); + }); + } if (p != processors[dim] - 1) { result = Cshift(result,dim,L); } @@ -269,15 +270,19 @@ public: flops+= flops_call*NN; // writing out result - thread_for(idx,sgrid->lSites(),{ + { + autoView(pgbuf_v,pgbuf,CpuRead); + autoView(result_v,result,CpuWrite); + thread_for(idx,sgrid->lSites(),{ Coordinate clbuf(Nd), cgbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,clbuf); cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; - peekLocalSite(s,pgbuf,cgbuf); - pokeLocalSite(s,result,clbuf); - }); + peekLocalSite(s,pgbuf_v,cgbuf); + pokeLocalSite(s,result_v,clbuf); + }); + } result = result*div; // destroying plan diff --git a/Grid/algorithms/iterative/BiCGSTAB.h b/Grid/algorithms/iterative/BiCGSTAB.h index 3a7be1ef..f4e5cdda 100644 --- a/Grid/algorithms/iterative/BiCGSTAB.h +++ b/Grid/algorithms/iterative/BiCGSTAB.h @@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction LinearCombTimer.Start(); bo = beta * omega; - auto p_v = p.View(); - auto r_v = r.View(); - auto v_v = v.View(); - accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); - }); + { + autoView( p_v , p, AcceleratorWrite); + autoView( r_v , r, AcceleratorRead); + autoView( v_v , v, AcceleratorRead); + accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); @@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction alpha = rho / Calpha.real(); LinearCombTimer.Start(); - auto h_v = h.View(); - auto psi_v = psi.View(); - accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); - }); - - auto s_v = s.View(); - accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); - }); + { + autoView( p_v , p, AcceleratorRead); + autoView( r_v , r, AcceleratorRead); + autoView( v_v , v, AcceleratorRead); + autoView( psi_v,psi, AcceleratorRead); + autoView( h_v , h, AcceleratorWrite); + autoView( s_v , s, AcceleratorWrite); + accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); + }); + accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); @@ -166,13 +172,19 @@ class BiCGSTAB : public OperatorFunction omega = Comega.real() / norm2(t); LinearCombTimer.Start(); - auto t_v = t.View(); - accelerator_for(ss, 
psi_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); - coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); - }); + { + autoView( psi_v,psi, AcceleratorWrite); + autoView( r_v , r, AcceleratorWrite); + autoView( h_v , h, AcceleratorRead); + autoView( s_v , s, AcceleratorRead); + autoView( t_v , t, AcceleratorRead); + accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); + coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); + }); + } LinearCombTimer.Stop(); - + cp = norm2(r); LinalgTimer.Stop(); diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 3a2544b5..14f3d306 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -140,13 +140,15 @@ public: b = cp / c; LinearCombTimer.Start(); - auto psi_v = psi.View(); - auto p_v = p.View(); - auto r_v = r.View(); - accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); - coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); - }); + { + autoView( psi_v , psi, AcceleratorWrite); + autoView( p_v , p, AcceleratorWrite); + autoView( r_v , r, AcceleratorWrite); + accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); + coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h new file mode 100644 index 00000000..22b7725e --- /dev/null +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -0,0 +1,241 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_PREC_GCR_NON_HERM_H +#define GRID_PREC_GCR_NON_HERM_H + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +//VPGCR Abe and Zhang, 2005. +//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING +//Computing and Information Volume 2, Number 2, Pages 147-161 +//NB. Likely not original reference since they are focussing on a preconditioner variant. 
+// but VPGCR was nicely written up in their paper +/////////////////////////////////////////////////////////////////////////////////////////////////////// +NAMESPACE_BEGIN(Grid); + +#define GCRLogLevel std::cout << GridLogMessage < +class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction { +public: + + RealD Tolerance; + Integer MaxIterations; + int verbose; + int mmax; + int nstep; + int steps; + int level; + GridStopWatch PrecTimer; + GridStopWatch MatTimer; + GridStopWatch LinalgTimer; + + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; + + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : + Tolerance(tol), + MaxIterations(maxit), + Linop(_Linop), + Preconditioner(Prec), + mmax(_mmax), + nstep(_nstep) + { + level=1; + verbose=1; + }; + + void operator() (const Field &src, Field &psi){ + + psi=Zero(); + RealD cp, ssq,rsq; + ssq=norm2(src); + rsq=Tolerance*Tolerance*ssq; + + Field r(src.Grid()); + + PrecTimer.Reset(); + MatTimer.Reset(); + LinalgTimer.Reset(); + + GridStopWatch SolverTimer; + SolverTimer.Start(); + + steps=0; + for(int k=0;k q(mmax,grid); + std::vector p(mmax,grid); + std::vector qq(mmax); + + GCRLogLevel<< "PGCR nStep("<(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. + for(int back=0;back=0); + + b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; + p[peri_kp]=p[peri_kp]+b*p[peri_back]; + q[peri_kp]=q[peri_kp]+b*q[peri_back]; + + } + qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm + LinalgTimer.Stop(); + } + assert(0); // never reached + return cp; + } +}; +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index ef6459ed..0d1707d9 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; -int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; -#ifdef GRID_CUDA -int PointerCache::Ncache = 32; -#else -int PointerCache::Ncache = 8; -#endif -int PointerCache::Victim; -int PointerCache::VictimSmall; -PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax]; -PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax]; - -void PointerCache::Init(void) -{ - char * str; - - str= getenv("GRID_ALLOC_NCACHE_LARGE"); - if ( str ) Ncache = atoi(str); - if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; - - str= getenv("GRID_ALLOC_NCACHE_SMALL"); - if ( str ) NcacheSmall = atoi(str); - if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; - - // printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax); -} -void *PointerCache::Insert(void *ptr,size_t bytes) -{ - if (bytes < GRID_ALLOC_SMALL_LIMIT ) - return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall); - return Insert(ptr,bytes,Entries,Ncache,Victim); -} -void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) -{ -#ifdef GRID_OMP - assert(omp_in_parallel()==0); -#endif - - void * ret = NULL; - int v = -1; - - for(int e=0;e See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#ifndef 
GRID_ALIGNED_ALLOCATOR_H -#define GRID_ALIGNED_ALLOCATOR_H - -#ifdef HAVE_MALLOC_MALLOC_H -#include -#endif -#ifdef HAVE_MALLOC_H -#include -#endif - -#ifdef HAVE_MM_MALLOC_H -#include -#endif - -#define POINTER_CACHE -#define GRID_ALLOC_ALIGN (2*1024*1024) -#define GRID_ALLOC_SMALL_LIMIT (4096) +#pragma once NAMESPACE_BEGIN(Grid); -// Move control to configure.ac and Config.h? - -class PointerCache { -private: -/*Pinning pages is costly*/ -/*Could maintain separate large and small allocation caches*/ -/* Could make these configurable, perhaps up to a max size*/ - static const int NcacheSmallMax=128; - static const int NcacheMax=16; - static int NcacheSmall; - static int Ncache; - - typedef struct { - void *address; - size_t bytes; - int valid; - } PointerCacheEntry; - - static PointerCacheEntry Entries[NcacheMax]; - static int Victim; - static PointerCacheEntry EntriesSmall[NcacheSmallMax]; - static int VictimSmall; - -public: - static void Init(void); - static void *Insert(void *ptr,size_t bytes) ; - static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ; - static void *Lookup(size_t bytes) ; - static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ; -}; - -std::string sizeString(size_t bytes); - -struct MemoryStats -{ - size_t totalAllocated{0}, maxAllocated{0}, - currentlyAllocated{0}, totalFreed{0}; -}; - -class MemoryProfiler -{ -public: - static MemoryStats *stats; - static bool debug; -}; - -#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" -#define profilerDebugPrint \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ - << std::endl; \ - std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ - << std::endl; \ - } - -#define profilerAllocate(bytes) \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - s->totalAllocated += (bytes); \ - s->currentlyAllocated += (bytes); \ - s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ - } \ - if (MemoryProfiler::debug) \ - { \ - std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ - profilerDebugPrint; \ - } - -#define profilerFree(bytes) \ - if (MemoryProfiler::stats) \ - { \ - auto s = MemoryProfiler::stats; \ - s->totalFreed += (bytes); \ - s->currentlyAllocated -= (bytes); \ - } \ - if (MemoryProfiler::debug) \ - { \ - std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ - profilerDebugPrint; \ - } - -void check_huge_pages(void *Buf,uint64_t BYTES); - -//////////////////////////////////////////////////////////////////// -// A lattice of something, but assume the something is SIMDized. 
-//////////////////////////////////////////////////////////////////// - template class alignedAllocator { public: @@ -164,71 +53,60 @@ public: { size_type bytes = __n*sizeof(_Tp); profilerAllocate(bytes); - - -#ifdef POINTER_CACHE - _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); -#else - pointer ptr = nullptr; -#endif - -#ifdef GRID_NVCC - //////////////////////////////////// - // Unified (managed) memory - //////////////////////////////////// - if ( ptr == (_Tp *) NULL ) { - // printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout); - // auto err = -gridMallocManaged((void **)&ptr,bytes); -/*if( err != cudaSuccess ) { - ptr = (_Tp *) NULL; - std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " < inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } +template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } -#ifdef GRID_NVCC - if ( __freeme ) gridFree((void *)__freeme); -#else - #ifdef HAVE_MM_MALLOC_H - if ( __freeme ) _mm_free((void *)__freeme); - #else - if ( __freeme ) free((void *)__freeme); - #endif -#endif +template +class uvmAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef uvmAllocator<_Tp1> other; }; + uvmAllocator() throw() { } + uvmAllocator(const uvmAllocator&) throw() { } + template uvmAllocator(const uvmAllocator<_Tp1>&) throw() { } + ~uvmAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::SharedFree((void *)__p,bytes); } // FIXME: hack for the copy constructor, eventually it must be avoided @@ -237,17 +115,17 @@ gridMallocManaged((void **)&ptr,bytes); void construct(pointer __p) { }; void destroy(pointer __p) { }; }; -template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } -template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } +template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = alignedAllocator; -template using Vector = std::vector >; -template using commVector = std::vector >; -template using Matrix = std::vector > >; +template using commAllocator = uvmAllocator; +template using Vector = std::vector >; +template using commVector = std::vector >; +//template using Matrix = std::vector > >; NAMESPACE_END(Grid); -#endif + diff --git a/Grid/allocator/Allocator.h b/Grid/allocator/Allocator.h new file mode 100644 index 00000000..589ea36f --- /dev/null +++ b/Grid/allocator/Allocator.h @@ -0,0 
+1,4 @@ +#pragma once +#include +#include +#include diff --git a/Grid/allocator/GridMemoryManager.cc b/Grid/allocator/GridMemoryManager.cc deleted file mode 100644 index 369f72f7..00000000 --- a/Grid/allocator/GridMemoryManager.cc +++ /dev/null @@ -1,145 +0,0 @@ -#include - -NAMESPACE_BEGIN(Grid); - -#define _GRID_MEM_PAGE_SIZE 4096 -void* _grid_mem_base = 0; -size_t _grid_mem_pages; -struct _grid_mem_range { - size_t page_start, page_end; -}; -std::vector<_grid_mem_range> _grid_mem_avail; -std::map _grid_mem_alloc; - -void gridMemoryInit() { -#ifdef GRID_NVCC - size_t free,total; - cudaMemGetInfo(&free,&total); - - char* ev = getenv("GRID_DEVICE_BYTES_FOR_CACHE"); - if (ev) { - long bytes; - assert(sscanf(ev,"%ld",&bytes)==1); - free -= bytes; - } - - _grid_mem_pages = free / _GRID_MEM_PAGE_SIZE; - size_t sz = _grid_mem_pages * _GRID_MEM_PAGE_SIZE; - - assert(cudaSuccess == cudaMallocManaged(&_grid_mem_base,sz)); - - int target; - cudaGetDevice(&target); - cudaMemAdvise(_grid_mem_base,sz,cudaMemAdviseSetPreferredLocation,target); - - assert(cudaSuccess == cudaMemset(_grid_mem_base,0,sz)); // touch on device - std::cout << GridLogMessage << "gridMemoryInit: " << sz << " bytes" << std::endl; - - _grid_mem_avail.push_back( { 0, _grid_mem_pages } ); -#endif -} - -void gridMallocManaged(void** pp, size_t sz) { -#ifdef GRID_NVCC - if (_grid_mem_avail.empty()) - gridMemoryInit(); - - size_t pages = (sz + _GRID_MEM_PAGE_SIZE - 1) / _GRID_MEM_PAGE_SIZE; - // find free block - size_t m; - for (m=0;m<_grid_mem_avail.size();m++) { - auto & b = _grid_mem_avail[m]; - if (b.page_end - b.page_start >= pages) - break; - } - if (m == _grid_mem_avail.size()) { - std::cout << GridLogMessage << "Out of memory" << std::endl; - assert(0); - } - *pp = (char*)_grid_mem_base + _GRID_MEM_PAGE_SIZE*_grid_mem_avail[m].page_start; - _grid_mem_alloc[*pp] = { _grid_mem_avail[m].page_start, _grid_mem_avail[m].page_start + pages }; - _grid_mem_avail[m].page_start += pages; -#else - *pp = malloc(sz); -#endif -} - -void gridFree(void* p) { -#ifdef GRID_NVCC - if (_grid_mem_avail.empty()) - gridMemoryInit(); - - auto & alloc = _grid_mem_alloc[p]; - if (alloc.page_start == alloc.page_end) { - free(p); - //cudaFreeHost(p); - } else { - // can we enlarge existing one? 
- for (size_t m=0;m<_grid_mem_avail.size();m++) { - auto & b = _grid_mem_avail[m]; - if (b.page_start == alloc.page_end) { - b.page_start = alloc.page_start; - return; - } - if (b.page_end == alloc.page_start) { - b.page_end = alloc.page_end; - return; - } - } - // fragment memory - _grid_mem_avail.push_back( alloc ); - } - _grid_mem_alloc.erase(p); -#else - free(p); -#endif -} - -void gridAcceleratorPrefetch(void* p, size_t sz) { -#ifdef GRID_NVCC - auto & alloc = _grid_mem_alloc[p]; - if (alloc.page_start == alloc.page_end) // pinned to host - return; - - int target; - cudaGetDevice(&target); - cudaMemPrefetchAsync(p,sz,target); -#endif -} - -void gridMemGetInfo(size_t* pfree, size_t* ptotal) { -#ifdef GRID_NVCC - if (_grid_mem_avail.empty()) - gridMemoryInit(); - - *ptotal = _grid_mem_pages * _GRID_MEM_PAGE_SIZE; - *pfree = 0; - for (auto & a : _grid_mem_avail) - *pfree += (a.page_end - a.page_start) * _GRID_MEM_PAGE_SIZE; -#else - *pfree = 0; - *ptotal = 0; -#endif -} - -void gridMoveToHost(void** pp) { -#ifdef GRID_NVCC - if (_grid_mem_avail.empty()) - gridMemoryInit(); - - auto & alloc = _grid_mem_alloc[*pp]; - if (alloc.page_start == alloc.page_end) // already on host - return; - - size_t sz = (alloc.page_end - alloc.page_start) * _GRID_MEM_PAGE_SIZE; - void*pn; - //assert(cudaSuccess == cudaMallocHost(&pn,sz)); - pn = malloc(sz); - memcpy(pn,*pp,sz); - gridFree(*pp); - *pp = pn; - _grid_mem_alloc[pn] = { 0,0 }; -#endif -} - -NAMESPACE_END(Grid); diff --git a/Grid/allocator/GridMemoryManager.h b/Grid/allocator/GridMemoryManager.h deleted file mode 100644 index 9e619301..00000000 --- a/Grid/allocator/GridMemoryManager.h +++ /dev/null @@ -1,42 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/GridMemoryManager.h - - Copyright (C) 2020 - -Author: Christoph Lehner - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#ifndef GRID_MEMORY_MANAGER_H -#define GRID_MEMORY_MANAGER_H - -NAMESPACE_BEGIN(Grid); - -void gridMemoryInit(); -void gridMallocManaged(void** pp, size_t sz); -void gridMoveToHost(void** pp); -void gridAcceleratorPrefetch(void* p, size_t sz); -void gridMemGetInfo(size_t* pfree, size_t* ptotal); -void gridFree(void* p); - -NAMESPACE_END(Grid); - -#endif diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc new file mode 100644 index 00000000..a2866507 --- /dev/null +++ b/Grid/allocator/MemoryManager.cc @@ -0,0 +1,249 @@ +#include + +NAMESPACE_BEGIN(Grid); + +/*Allocation types, saying which pointer cache should be used*/ +#define Cpu (0) +#define CpuSmall (1) +#define Acc (2) +#define AccSmall (3) +#define Shared (4) +#define SharedSmall (5) +uint64_t total_shared; +uint64_t total_device; +uint64_t total_host;; +void MemoryManager::PrintBytes(void) +{ + std::cout << " MemoryManager : "<=0) && (Nc < NallocCacheMax)) { + Ncache[Cpu]=Nc; + Ncache[Acc]=Nc; + Ncache[Shared]=Nc; + } + } + + str= getenv("GRID_ALLOC_NCACHE_SMALL"); + if ( str ) { + Nc = atoi(str); + if ( (Nc>=0) && (Nc < NallocCacheMax)) { + Ncache[CpuSmall]=Nc; + Ncache[AccSmall]=Nc; + Ncache[SharedSmall]=Nc; + } + } + + // only root node delivers messages, this is called before communicator is initialized, + // so need a manual restriction + if ( CartesianCommunicator::RankWorld() == 0 ) { + std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<0); +#ifdef GRID_OMP + assert(omp_in_parallel()==0); +#endif + + void * ret = NULL; + int v = -1; + + for(int e=0;e0); +#ifdef GRID_OMP + assert(omp_in_parallel()==0); +#endif + for(int e=0;e +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once +#include +#include + +NAMESPACE_BEGIN(Grid); + +// Move control to configure.ac and Config.h? + +#define ALLOCATION_CACHE +#define GRID_ALLOC_ALIGN (2*1024*1024) +#define GRID_ALLOC_SMALL_LIMIT (4096) + +/*Pinning pages is costly*/ +//////////////////////////////////////////////////////////////////////////// +// Advise the LatticeAccelerator class +//////////////////////////////////////////////////////////////////////////// +enum ViewAdvise { + AdviseDefault = 0x0, // Regular data + AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. + + // AdviseTransient = 0x2, // Data will mostly be read. 
On some architectures + // enables read-only copies of memory to be kept on + // host and device. + + // AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device + +}; + +//////////////////////////////////////////////////////////////////////////// +// View Access Mode +//////////////////////////////////////////////////////////////////////////// +enum ViewMode { + AcceleratorRead = 0x01, + AcceleratorWrite = 0x02, + AcceleratorWriteDiscard = 0x04, + CpuRead = 0x08, + CpuWrite = 0x10, + CpuWriteDiscard = 0x10 // same for now +}; + +class MemoryManager { +private: + + //////////////////////////////////////////////////////////// + // For caching recently freed allocations + //////////////////////////////////////////////////////////// + typedef struct { + void *address; + size_t bytes; + int valid; + } AllocationCacheEntry; + + static const int NallocCacheMax=128; + static const int NallocType=6; + static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; + static int Victim[NallocType]; + static int Ncache[NallocType]; + + ///////////////////////////////////////////////// + // Free pool + ///////////////////////////////////////////////// + static void *Insert(void *ptr,size_t bytes,int type) ; + static void *Lookup(size_t bytes,int type) ; + static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; + static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; + + static void *AcceleratorAllocate(size_t bytes); + static void AcceleratorFree (void *ptr,size_t bytes); + static void PrintBytes(void); + public: + static void Init(void); + static void *SharedAllocate(size_t bytes); + static void SharedFree (void *ptr,size_t bytes); + static void *CpuAllocate(size_t bytes); + static void CpuFree (void *ptr,size_t bytes); + + //////////////////////////////////////////////////////// + // Footprint tracking + //////////////////////////////////////////////////////// + static uint64_t DeviceBytes; + static uint64_t DeviceLRUBytes; + static uint64_t DeviceMaxBytes; + static uint64_t HostToDeviceBytes; + static uint64_t DeviceToHostBytes; + static uint64_t HostToDeviceXfer; + static uint64_t DeviceToHostXfer; + + private: +#ifndef GRID_UVM + ////////////////////////////////////////////////////////////////////// + // Data tables for ViewCache + ////////////////////////////////////////////////////////////////////// + typedef std::list LRU_t; + typedef typename LRU_t::iterator LRUiterator; + typedef struct { + int LRU_valid; + LRUiterator LRU_entry; + uint64_t CpuPtr; + uint64_t AccPtr; + size_t bytes; + uint32_t transient; + uint32_t state; + uint32_t accLock; + uint32_t cpuLock; + } AcceleratorViewEntry; + + typedef std::unordered_map AccViewTable_t; + typedef typename AccViewTable_t::iterator AccViewTableIterator ; + + static AccViewTable_t AccViewTable; + static LRU_t LRU; + + ///////////////////////////////////////////////// + // Device motion + ///////////////////////////////////////////////// + static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static void EvictVictims(uint64_t bytes); // Frees up + static void Evict(AcceleratorViewEntry &AccCache); + static void Flush(AcceleratorViewEntry &AccCache); + static void Clone(AcceleratorViewEntry &AccCache); + static void AccDiscard(AcceleratorViewEntry &AccCache); + static void CpuDiscard(AcceleratorViewEntry &AccCache); + + // static void LRUupdate(AcceleratorViewEntry &AccCache); + static void LRUinsert(AcceleratorViewEntry &AccCache); 
+ static void LRUremove(AcceleratorViewEntry &AccCache); + + // manage entries in the table + static int EntryPresent(uint64_t CpuPtr); + static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static void EntryErase (uint64_t CpuPtr); + static AccViewTableIterator EntryLookup(uint64_t CpuPtr); + static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry); + + static void AcceleratorViewClose(uint64_t AccPtr); + static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static void CpuViewClose(uint64_t Ptr); + static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); +#endif + static void NotifyDeletion(void * CpuPtr); + + public: + static void Print(void); + static int isOpen (void* CpuPtr); + static void ViewClose(void* CpuPtr,ViewMode mode); + static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + +}; + +NAMESPACE_END(Grid); + + diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc new file mode 100644 index 00000000..5dd7575e --- /dev/null +++ b/Grid/allocator/MemoryManagerCache.cc @@ -0,0 +1,468 @@ +#include + +#ifndef GRID_UVM + +#warning "Using explicit device memory copies" +NAMESPACE_BEGIN(Grid); +#define dprintf(...) + +//////////////////////////////////////////////////////////// +// For caching copies of data on device +//////////////////////////////////////////////////////////// +MemoryManager::AccViewTable_t MemoryManager::AccViewTable; +MemoryManager::LRU_t MemoryManager::LRU; + +//////////////////////////////////////////////////////// +// Footprint tracking +//////////////////////////////////////////////////////// +uint64_t MemoryManager::DeviceBytes; +uint64_t MemoryManager::DeviceLRUBytes; +uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128; +uint64_t MemoryManager::HostToDeviceBytes; +uint64_t MemoryManager::DeviceToHostBytes; +uint64_t MemoryManager::HostToDeviceXfer; +uint64_t MemoryManager::DeviceToHostXfer; + +//////////////////////////////////// +// Priority ordering for unlocked entries +// Empty +// CpuDirty +// Consistent +// AccDirty +//////////////////////////////////// +#define Empty (0x0) /*Entry unoccupied */ +#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/ +#define Consistent (0x2) /*ACC copy AND CPU copy are valid */ +#define AccDirty (0x4) /*ACC copy is golden */ +#define EvictNext (0x8) /*Priority for eviction*/ + +///////////////////////////////////////////////// +// Mechanics of data table maintenance +///////////////////////////////////////////////// +int MemoryManager::EntryPresent(uint64_t CpuPtr) +{ + if(AccViewTable.empty()) return 0; + + auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1)); + return count; +} +void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + assert(!EntryPresent(CpuPtr)); + AcceleratorViewEntry AccCache; + AccCache.CpuPtr = CpuPtr; + AccCache.AccPtr = (uint64_t)NULL; + AccCache.bytes = bytes; + AccCache.state = CpuDirty; + AccCache.LRU_valid=0; + AccCache.transient=0; + AccCache.accLock=0; + AccCache.cpuLock=0; + AccViewTable[CpuPtr] = AccCache; +} +MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr) +{ + assert(EntryPresent(CpuPtr)); + auto AccCacheIterator = AccViewTable.find(CpuPtr); + assert(AccCacheIterator!=AccViewTable.end()); + return AccCacheIterator; +} +void MemoryManager::EntryErase(uint64_t CpuPtr) +{ + auto AccCache = 
EntryLookup(CpuPtr); + AccViewTable.erase(CpuPtr); +} +void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.LRU_valid==0); + if (AccCache.transient) { + LRU.push_back(AccCache.CpuPtr); + AccCache.LRU_entry = --LRU.end(); + } else { + LRU.push_front(AccCache.CpuPtr); + AccCache.LRU_entry = LRU.begin(); + } + AccCache.LRU_valid = 1; + DeviceLRUBytes+=AccCache.bytes; +} +void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.LRU_valid==1); + LRU.erase(AccCache.LRU_entry); + AccCache.LRU_valid = 0; + DeviceLRUBytes-=AccCache.bytes; +} +///////////////////////////////////////////////// +// Accelerator cache motion & consistency logic +///////////////////////////////////////////////// +void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) +{ + /////////////////////////////////////////////////////////// + // Remove from Accelerator, remove entry, without flush + // Cannot be locked. If allocated Must be in LRU pool. + /////////////////////////////////////////////////////////// + assert(AccCache.state!=Empty); + + // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + assert(AccCache.accLock==0); + assert(AccCache.cpuLock==0); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr) { + AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceBytes -=AccCache.bytes; + LRUremove(AccCache); + // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + } + uint64_t CpuPtr = AccCache.CpuPtr; + EntryErase(CpuPtr); +} + +void MemoryManager::Evict(AcceleratorViewEntry &AccCache) +{ + /////////////////////////////////////////////////////////////////////////// + // Make CPU consistent, remove from Accelerator, remove entry + // Cannot be locked. If allocated must be in LRU pool. 
+ /////////////////////////////////////////////////////////////////////////// + assert(AccCache.state!=Empty); + + // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + assert(AccCache.accLock==0); + assert(AccCache.cpuLock==0); + if(AccCache.state==AccDirty) { + Flush(AccCache); + } + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr) { + AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceBytes -=AccCache.bytes; + LRUremove(AccCache); + // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + } + uint64_t CpuPtr = AccCache.CpuPtr; + EntryErase(CpuPtr); +} +void MemoryManager::Flush(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.state==AccDirty); + assert(AccCache.cpuLock==0); + assert(AccCache.accLock==0); + assert(AccCache.AccPtr!=(uint64_t)NULL); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); + // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + DeviceToHostBytes+=AccCache.bytes; + DeviceToHostXfer++; + AccCache.state=Consistent; +} +void MemoryManager::Clone(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.state==CpuDirty); + assert(AccCache.cpuLock==0); + assert(AccCache.accLock==0); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr==(uint64_t)NULL){ + AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); + DeviceBytes+=AccCache.bytes; + } + // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); + HostToDeviceBytes+=AccCache.bytes; + HostToDeviceXfer++; + AccCache.state=Consistent; +} + +void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) +{ + assert(AccCache.state!=Empty); + assert(AccCache.cpuLock==0); + assert(AccCache.accLock==0); + assert(AccCache.CpuPtr!=(uint64_t)NULL); + if(AccCache.AccPtr==(uint64_t)NULL){ + AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); + DeviceBytes+=AccCache.bytes; + } + AccCache.state=AccDirty; +} + +///////////////////////////////////////////////////////////////////////////////// +// View management +///////////////////////////////////////////////////////////////////////////////// +void MemoryManager::ViewClose(void* Ptr,ViewMode mode) +{ + if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + AcceleratorViewClose((uint64_t)Ptr); + } else if( (mode==CpuRead)||(mode==CpuWrite)){ + CpuViewClose((uint64_t)Ptr); + } else { + assert(0); + } +} +void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + uint64_t CpuPtr = (uint64_t)_CpuPtr; + if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); + } else if( (mode==CpuRead)||(mode==CpuWrite)){ + return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); + } else { + assert(0); + return NULL; + } +} +void MemoryManager::EvictVictims(uint64_t bytes) +{ + while(bytes+DeviceLRUBytes > DeviceMaxBytes){ + if ( DeviceLRUBytes > 0){ + assert(LRU.size()>0); + uint64_t victim = LRU.back(); + auto AccCacheIterator = EntryLookup(victim); + auto & AccCache = AccCacheIterator->second; + Evict(AccCache); + } + } +} +uint64_t MemoryManager::AcceleratorViewOpen(uint64_t 
CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + //////////////////////////////////////////////////////////////////////////// + // Find if present, otherwise get or force an empty + //////////////////////////////////////////////////////////////////////////// + if ( EntryPresent(CpuPtr)==0 ){ + EvictVictims(bytes); + EntryCreate(CpuPtr,bytes,mode,hint); + } + + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); + + assert(AccCache.cpuLock==0); // Programming error + + if(AccCache.state!=Empty) { + assert(AccCache.CpuPtr == CpuPtr); + assert(AccCache.bytes ==bytes); + } +/* + * State transitions and actions + * + * Action State StateNext Flush Clone + * + * AccRead Empty Consistent - Y + * AccWrite Empty AccDirty - Y + * AccRead CpuDirty Consistent - Y + * AccWrite CpuDirty AccDirty - Y + * AccRead Consistent Consistent - - + * AccWrite Consistent AccDirty - - + * AccRead AccDirty AccDirty - - + * AccWrite AccDirty AccDirty - - + */ + if(AccCache.state==Empty) { + assert(AccCache.LRU_valid==0); + AccCache.CpuPtr = CpuPtr; + AccCache.AccPtr = (uint64_t)NULL; + AccCache.bytes = bytes; + AccCache.state = CpuDirty; // Cpu starts primary + if(mode==AcceleratorWriteDiscard){ + CpuDiscard(AccCache); + AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty + } else if(mode==AcceleratorWrite){ + Clone(AccCache); + AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty + } else { + Clone(AccCache); + AccCache.state = Consistent; // Empty + AccRead => Consistent + } + AccCache.accLock= 1; + } else if(AccCache.state==CpuDirty ){ + if(mode==AcceleratorWriteDiscard) { + CpuDiscard(AccCache); + AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty + } else if(mode==AcceleratorWrite) { + Clone(AccCache); + AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty + } else { + Clone(AccCache); + AccCache.state = Consistent; // CpuDirty + AccRead => Consistent + } + AccCache.accLock++; + // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + } else if(AccCache.state==Consistent) { + if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) + AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty + else + AccCache.state = Consistent; // Consistent + AccRead => Consistent + AccCache.accLock++; + // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + } else if(AccCache.state==AccDirty) { + if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) + AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty + else + AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty + AccCache.accLock++; + // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + } else { + assert(0); + } + + // If view is opened on device remove from LRU + if(AccCache.LRU_valid==1){ + // must possibly remove from LRU as now locked on GPU + LRUremove(AccCache); + } + + int transient =hint; + AccCache.transient= transient? 
EvictNext : 0; + + return AccCache.AccPtr; +} +//////////////////////////////////// +// look up & decrement lock count +//////////////////////////////////// +void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) +{ + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert(AccCache.cpuLock==0); + assert(AccCache.accLock>0); + + AccCache.accLock--; + + // Move to LRU queue if not locked and close on device + if(AccCache.accLock==0) { + LRUinsert(AccCache); + } +} +void MemoryManager::CpuViewClose(uint64_t CpuPtr) +{ + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert(AccCache.cpuLock>0); + assert(AccCache.accLock==0); + + AccCache.cpuLock--; +} +/* + * Action State StateNext Flush Clone + * + * CpuRead Empty CpuDirty - - + * CpuWrite Empty CpuDirty - - + * CpuRead CpuDirty CpuDirty - - + * CpuWrite CpuDirty CpuDirty - - + * CpuRead Consistent Consistent - - + * CpuWrite Consistent CpuDirty - - + * CpuRead AccDirty Consistent Y - + * CpuWrite AccDirty CpuDirty Y - + */ +uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient) +{ + //////////////////////////////////////////////////////////////////////////// + // Find if present, otherwise get or force an empty + //////////////////////////////////////////////////////////////////////////// + if ( EntryPresent(CpuPtr)==0 ){ + EvictVictims(bytes); + EntryCreate(CpuPtr,bytes,mode,transient); + } + + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + + assert((mode==CpuRead)||(mode==CpuWrite)); + assert(AccCache.accLock==0); // Programming error + + if(AccCache.state!=Empty) { + assert(AccCache.CpuPtr == CpuPtr); + assert(AccCache.bytes==bytes); + } + + if(AccCache.state==Empty) { + AccCache.CpuPtr = CpuPtr; + AccCache.AccPtr = (uint64_t)NULL; + AccCache.bytes = bytes; + AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty + AccCache.accLock= 0; + AccCache.cpuLock= 1; + } else if(AccCache.state==CpuDirty ){ + // AccPtr dont care, deferred allocate + AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty + AccCache.cpuLock++; + } else if(AccCache.state==Consistent) { + assert(AccCache.AccPtr != (uint64_t)NULL); + if(mode==CpuWrite) + AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty + else + AccCache.state = Consistent; // Consistent +CpuRead => Consistent + AccCache.cpuLock++; + } else if(AccCache.state==AccDirty) { + assert(AccCache.AccPtr != (uint64_t)NULL); + Flush(AccCache); + if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush + else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush + AccCache.cpuLock++; + } else { + assert(0); // should be unreachable + } + + AccCache.transient= transient? 
EvictNext : 0; + + return AccCache.CpuPtr; +} +void MemoryManager::NotifyDeletion(void *_ptr) +{ + // Look up in ViewCache + uint64_t ptr = (uint64_t)_ptr; + if(EntryPresent(ptr)) { + auto e = EntryLookup(ptr); + AccDiscard(e->second); + } +} +void MemoryManager::Print(void) +{ + std::cout << GridLogDebug << "--------------------------------------------" << std::endl; + std::cout << GridLogDebug << "Memory Manager " << std::endl; + std::cout << GridLogDebug << "--------------------------------------------" << std::endl; + std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; + std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; + std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; + std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; + std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; + std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; + std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; + std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; + std::cout << GridLogDebug << "--------------------------------------------" << std::endl; + std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + std::cout << GridLogDebug << "0x"<second; + return AccCache.cpuLock+AccCache.accLock; + } else { + return 0; + } +} + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc new file mode 100644 index 00000000..537f7c32 --- /dev/null +++ b/Grid/allocator/MemoryManagerShared.cc @@ -0,0 +1,24 @@ +#include +#ifdef GRID_UVM + +#warning "Grid is assuming unified virtual memory address space" +NAMESPACE_BEGIN(Grid); +///////////////////////////////////////////////////////////////////////////////// +// View management is 1:1 address space mapping +///////////////////////////////////////////////////////////////////////////////// +uint64_t MemoryManager::DeviceBytes; +uint64_t MemoryManager::DeviceLRUBytes; +uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128; +uint64_t MemoryManager::HostToDeviceBytes; +uint64_t MemoryManager::DeviceToHostBytes; +uint64_t MemoryManager::HostToDeviceXfer; +uint64_t MemoryManager::DeviceToHostXfer; + +void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; +void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; +int MemoryManager::isOpen (void* CpuPtr) { return 0;} +void MemoryManager::Print(void){}; +void MemoryManager::NotifyDeletion(void *ptr){}; + +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc new file mode 100644 index 00000000..0d1707d9 --- /dev/null +++ b/Grid/allocator/MemoryStats.cc @@ -0,0 +1,67 @@ +#include +#include + +NAMESPACE_BEGIN(Grid); + +MemoryStats *MemoryProfiler::stats = nullptr; +bool MemoryProfiler::debug = false; + +void check_huge_pages(void *Buf,uint64_t BYTES) +{ +#ifdef __linux__ + int fd = open("/proc/self/pagemap", O_RDONLY); + assert(fd >= 
0); + const int page_size = 4096; + uint64_t virt_pfn = (uint64_t)Buf / page_size; + off_t offset = sizeof(uint64_t) * virt_pfn; + uint64_t npages = (BYTES + page_size-1) / page_size; + uint64_t pagedata[npages]; + uint64_t ret = lseek(fd, offset, SEEK_SET); + assert(ret == offset); + ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + assert(ret == sizeof(uint64_t) * npages); + int nhugepages = npages / 512; + int n4ktotal, nnothuge; + n4ktotal = 0; + nnothuge = 0; + for (int i = 0; i < nhugepages; ++i) { + uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; + for (int j = 0; j < 512; ++j) { + uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; + ++n4ktotal; + if (pageaddr != baseaddr + j * page_size) + ++nnothuge; + } + } + int rank = CartesianCommunicator::RankWorld(); + printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); +#endif +} + +std::string sizeString(const size_t bytes) +{ + constexpr unsigned int bufSize = 256; + const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; + char buf[256]; + size_t s = 0; + double count = bytes; + + while (count >= 1024 && s < 7) + { + s++; + count /= 1024; + } + if (count - floor(count) == 0.0) + { + snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); + } + else + { + snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); + } + + return std::string(buf); +} + +NAMESPACE_END(Grid); + diff --git a/Grid/allocator/MemoryStats.h b/Grid/allocator/MemoryStats.h new file mode 100644 index 00000000..156c9747 --- /dev/null +++ b/Grid/allocator/MemoryStats.h @@ -0,0 +1,95 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/MemoryStats.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + + +NAMESPACE_BEGIN(Grid); + +std::string sizeString(size_t bytes); + +struct MemoryStats +{ + size_t totalAllocated{0}, maxAllocated{0}, + currentlyAllocated{0}, totalFreed{0}; +}; + +class MemoryProfiler +{ +public: + static MemoryStats *stats; + static bool debug; +}; + +#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" +#define profilerDebugPrint \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ + << std::endl; \ + std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ + << std::endl; \ + } + +#define profilerAllocate(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalAllocated += (bytes); \ + s->currentlyAllocated += (bytes); \ + s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } + +#define profilerFree(bytes) \ + if (MemoryProfiler::stats) \ + { \ + auto s = MemoryProfiler::stats; \ + s->totalFreed += (bytes); \ + s->currentlyAllocated -= (bytes); \ + } \ + if (MemoryProfiler::debug) \ + { \ + std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \ + profilerDebugPrint; \ + } + +void check_huge_pages(void *Buf,uint64_t BYTES); + +NAMESPACE_END(Grid); + diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h index 87472cc9..ae1fd1fd 100644 --- a/Grid/cartesian/Cartesian_base.h +++ b/Grid/cartesian/Cartesian_base.h @@ -81,6 +81,7 @@ public: bool _isCheckerBoarded; int LocallyPeriodic; + Coordinate _checker_dim_mask; public: diff --git a/Grid/cartesian/Cartesian_full.h b/Grid/cartesian/Cartesian_full.h index c083817b..31a67bf0 100644 --- a/Grid/cartesian/Cartesian_full.h +++ b/Grid/cartesian/Cartesian_full.h @@ -38,6 +38,7 @@ class GridCartesian: public GridBase { public: int dummy; + Coordinate _checker_dim_mask; virtual int CheckerBoardFromOindexTable (int Oindex) { return 0; } @@ -104,6 +105,7 @@ public: _ldimensions.resize(_ndimension); _rdimensions.resize(_ndimension); _simd_layout.resize(_ndimension); + _checker_dim_mask.resize(_ndimension);; _lstart.resize(_ndimension); _lend.resize(_ndimension); @@ -114,6 +116,8 @@ public: for (int d = 0; d < _ndimension; d++) { + _checker_dim_mask[d]=0; + _fdimensions[d] = dimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions _simd_layout[d] = simd_layout[d]; diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index 34f763d2..b71981f5 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -35,12 +35,28 @@ static const int CbRed =0; static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; + +accelerator_inline int 
RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +{ + int nd=rdim.size(); + Coordinate coor(nd); + + Lexicographic::CoorFromIndex(coor,oindex,rdim); + + int linear=0; + for(int d=0;d _checker_board; diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 0e525674..6130195d 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/communicator/Communicator_mpi.cc @@ -35,7 +35,7 @@ Grid_MPI_Comm CartesianCommunicator::communicator_world; //////////////////////////////////////////// // First initialise of comms system //////////////////////////////////////////// -void CartesianCommunicator::Init(int *argc, char ***argv) +void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; @@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { - MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); +#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori + nCommThreads=1; + // wrong results here too + // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs + // other comms schemes are ok + MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); +#else + MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); +#endif //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { assert(0); @@ -91,7 +99,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) //////////////////////////////////////////////////////////////////////////////////////////////////////// // Initialises from communicator_world //////////////////////////////////////////////////////////////////////////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) +CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) { MPI_Comm optimal_comm; //////////////////////////////////////////////////// @@ -110,7 +118,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) ////////////////////////////////// // Try to subdivide communicator ////////////////////////////////// -CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) +CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) { _ndimension = processors.size(); assert(_ndimension>=1); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); @@ -127,7 +135,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const ////////////////////////////////////////////////////////////////////////////////////////////////////// // split the communicator ////////////////////////////////////////////////////////////////////////////////////////////////////// - // int Nparent = parent._processors ; + // int Nparent = parent._processors ; int Nparent; MPI_Comm_size(parent.communicator,&Nparent); @@ -149,13 +157,13 @@ CartesianCommunicator::CartesianCommunicator(const 
Coordinate &processors,const } // rank within subcomm ; srank is rank of subcomm within blocks of subcomms - int crank; + int crank; // Mpi uses the reverse Lexico convention to us; so reversed routines called Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids MPI_Comm comm_split; - if ( Nchild > 1 ) { + if ( Nchild > 1 ) { //////////////////////////////////////////////////////////////// // Split the communicator @@ -180,11 +188,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const SetCommunicator(comm_split); /////////////////////////////////////////////// - // Free the temp communicator + // Free the temp communicator /////////////////////////////////////////////// MPI_Comm_free(&comm_split); - if(0){ + if(0){ std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; for(int d=0;d &lis int myrank = _processor; int ierr; - if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { + if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { MPI_Request xrq; MPI_Request rrq; ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - + assert(ierr==0); list.push_back(xrq); list.push_back(rrq); - } else { + } else { // Give the CPU to MPI immediately; can use threads to overlap optionally ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, recv,bytes,MPI_CHAR,from, from, @@ -367,7 +375,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list,dir); } @@ -436,8 +444,8 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) communicator); assert(ierr==0); } -int CartesianCommunicator::RankWorld(void){ - int r; +int CartesianCommunicator::RankWorld(void){ + int r; MPI_Comm_rank(communicator_world,&r); return r; } @@ -470,7 +478,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. 
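The 32-bit count limit flagged in that comment is the usual reason for wrapping large transfers in a derived datatype: describe one "word" as a contiguous block of bytes so the count handed to MPI stays a small integer. A hedged standalone sketch of that idea (not necessarily identical to what this routine does internally; buffer and size arguments are hypothetical):

#include <mpi.h>
#include <cstdint>
#include <cassert>

// Sketch only: all-to-all whose per-rank payload in bytes may exceed INT_MAX.
// Each "word" of word_bytes bytes becomes one element of a derived datatype,
// so the count passed to MPI_Alltoall is a modest 32-bit integer.
void AllToAllLargeSketch(void *in, void *out, uint64_t words, uint64_t word_bytes, MPI_Comm comm)
{
  int iwords = (int)words;        // count in units of the derived type
  int ibytes = (int)word_bytes;   // bytes per word, assumed to fit in an int
  assert((uint64_t)iwords == words);
  assert((uint64_t)ibytes == word_bytes);

  MPI_Datatype word_t;
  MPI_Type_contiguous(ibytes, MPI_BYTE, &word_t);   // one element = ibytes bytes
  MPI_Type_commit(&word_t);

  MPI_Alltoall(in, iwords, word_t, out, iwords, word_t, comm);

  MPI_Type_free(&word_t);
}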
// (Turns up on 32^3 x 64 Gparity too) MPI_Datatype object; - int iwords; + int iwords; int ibytes; iwords = words; ibytes = bytes; @@ -483,5 +491,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t } NAMESPACE_END(Grid); - - diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index bd83ce52..45fefc71 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -29,7 +29,7 @@ Author: Peter Boyle #include #include -#ifdef GRID_NVCC +#ifdef GRID_CUDA #include #endif @@ -420,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// -#ifdef GRID_NVCC +#ifdef GRID_CUDA void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { void * ShmCommBuf ; @@ -440,13 +440,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) ////////////////////////////////////////////////////////////////////////////////////////////////////////// // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); -#ifdef GRID_IBM_SUMMIT - // IBM Jsrun makes cuda Device numbering screwy and not match rank - std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"< NAMESPACE_BEGIN(Grid); +extern Vector > Cshift_table; + /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// @@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int e2=rhs.Grid()->_slice_block[dimension]; int ent = 0; - static Vector > table; table.resize(e1*e2); + if(Cshift_table.size()_slice_stride[dimension]; - auto rhs_v = rhs.View(); if ( cbmask == 0x3 ) { for(int n=0;n(off+bo+b,so+o+b); + Cshift_table[ent++] = std::pair(off+bo+b,so+o+b); } } } else { @@ -65,14 +67,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int o = n*stride; int ocb=1<CheckerBoardFromOindex(o+b); if ( ocb &cbmask ) { - table[ent++]=std::pair (off+bo++,so+o+b); + Cshift_table[ent++]=std::pair (off+bo++,so+o+b); } } } } - thread_for(i,ent,{ - buffer[table[i].first]=rhs_v[table[i].second]; - }); + { + autoView(rhs_v , rhs, AcceleratorRead); + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); + } } /////////////////////////////////////////////////////////////////// @@ -95,36 +102,38 @@ Gather_plane_extract(const Lattice &rhs, int e2=rhs.Grid()->_slice_block[dimension]; int n1=rhs.Grid()->_slice_stride[dimension]; - auto rhs_v = rhs.View(); if ( cbmask ==0x3){ - thread_for_collapse(2,n,e1,{ - for(int b=0;b(temp,pointers,offset); - } - }); + }); } else { + autoView(rhs_v , rhs, AcceleratorRead); - // Case of SIMD split AND checker dim cannot currently be hit, except in - // Test_cshift_red_black code. 
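The Gather_plane_simple rewrite above follows a two-phase pattern: enumerate the (buffer slot, lattice site) index pairs once into a flat, reusable table (Cshift_table), then replay the table in a single branch-free copy that accelerator_for can offload. A minimal host-side sketch of the same pattern, with std::vector standing in for the device-resident Vector and commVector types:

#include <vector>
#include <utility>

// Sketch: build a (destination, source) index table for one plane, then copy.
// 'blocks', 'block' and 'stride' stand in for _slice_nblock/_slice_block/_slice_stride.
template<class T>
void gather_plane_sketch(const std::vector<T> &lattice, std::vector<T> &buffer,
                         int so, int blocks, int block, int stride)
{
  static std::vector<std::pair<int,int>> table;      // persists across calls, like Cshift_table
  if ((int)table.size() < blocks*block) table.resize(blocks*block);

  int ent = 0;
  for (int n = 0; n < blocks; n++)
    for (int b = 0; b < block; b++)
      table[ent++] = { n*block + b, so + n*stride + b };   // (buffer slot, lattice site)

  // Phase 2: a flat loop with no control flow, the shape accelerator_for wants.
  for (int i = 0; i < ent; i++)
    buffer[table[i].first] = lattice[table[i].second];
}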
- std::cout << " Dense packed buffer WARNING " <_rdimensions; + Coordinate cdm =rhs.Grid()->_checker_dim_mask; + std::cout << " Dense packed buffer WARNING " <CheckerBoardFromOindex(o+b); + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); } - } - }); + }); } } @@ -145,7 +154,8 @@ template void Scatter_plane_simple (Lattice &rhs,commVector_slice_block[dimension]; int stride=rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + if(Cshift_table.size() void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension]; - table[ent++] = std::pair(so+o+b,bo+b); + Cshift_table[ent++] = std::pair(so+o+b,bo+b); } } @@ -165,16 +175,20 @@ template void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { - table[ent++]=std::pair (so+o+b,bo++); + Cshift_table[ent++]=std::pair (so+o+b,bo++); } } } } - auto rhs_v = rhs.View(); - thread_for(i,ent,{ - rhs_v[table[i].first]=buffer[table[i].second]; - }); + { + autoView( rhs_v, rhs, AcceleratorWrite); + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + rhs_v[table[i].first]=buffer_p[table[i].second]; + }); + } } ////////////////////////////////////////////////////// @@ -194,21 +208,19 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - auto rhs_v = rhs.View(); - thread_for_collapse(2,n,e1,{ - for(int b=0;b_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; merge(rhs_v[so+o+b],pointers,offset); - } - }); + }); } else { // Case of SIMD split AND checker dim cannot currently be hit, except in // Test_cshift_red_black code. 
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<_slice_stride[dimension]; @@ -225,6 +237,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// + template void Copy_plane(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -239,14 +252,16 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e2=rhs.Grid()->_slice_block[dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + + if(Cshift_table.size()(lo+o,ro+o); + Cshift_table[ent++] = std::pair(lo+o,ro+o); } } } else { @@ -255,23 +270,24 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int o =n*stride+b; int ocb=1<CheckerBoardFromOindex(o); if ( ocb&cbmask ) { - table[ent++] = std::pair(lo+o,ro+o); + Cshift_table[ent++] = std::pair(lo+o,ro+o); } } } } - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for(i,ent,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; - }); - + { + autoView(rhs_v , rhs, AcceleratorRead); + autoView(lhs_v , lhs, AcceleratorWrite); + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); + } } template void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) { - int rd = rhs.Grid()->_rdimensions[dimension]; if ( !rhs.Grid()->CheckerBoarded(dimension) ) { @@ -285,29 +301,33 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_block [dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - static std::vector > table; table.resize(e1*e2); + if(Cshift_table.size()(lo+o+b,ro+o+b); + Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); }} } else { for(int n=0;nCheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) table[ent++] = std::pair(lo+o+b,ro+o+b); + if ( ocb&cbmask ) Cshift_table[ent++] = std::pair(lo+o+b,ro+o+b); }} } - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for(i,ent,{ - permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); - }); + { + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); + }); + } } ////////////////////////////////////////////////////// diff --git a/Grid/cshift/Cshift_table.cc b/Grid/cshift/Cshift_table.cc new file mode 100644 index 00000000..d46e51c0 --- /dev/null +++ b/Grid/cshift/Cshift_table.cc @@ -0,0 +1,4 @@ +#include +NAMESPACE_BEGIN(Grid); +Vector > Cshift_table; +NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 036633b4..a3017198 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -26,6 +26,7 @@ Author: Peter Boyle *************************************************************************************/ /* END LEGAL */ #pragma once +#include #include #include #include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index da63d5e6..91b456d9 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -92,12 +92,18 @@ const lobj & eval(const 
uint64_t ss, const LatticeView &arg) { return arg[ss]; } + +// What needs this? +// Cannot be legal on accelerator +// Comparison must convert +#if 1 template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) { - auto view = arg.AcceleratorView(ViewRead); + auto view = arg.View(AcceleratorRead); return view[ss]; } +#endif /////////////////////////////////////////////////// // handle nodes in syntax tree- eval one operand @@ -180,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf cb = lat.Checkerboard(); } template ::value, T1>::type * = nullptr> -inline void CBFromExpression(int &cb, const T1 ¬lat) // non-lattice leaf -{ -} - +inline void CBFromExpression(int &cb, const T1 ¬lat) {} // non-lattice leaf template inline void CBFromExpression(int &cb,const LatticeUnaryExpression &expr) { CBFromExpression(cb, expr.arg1); // recurse AST } - template inline void CBFromExpression(int &cb,const LatticeBinaryExpression &expr) { @@ -204,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression::value, T1>::type * = nullptr> +inline void ExpressionViewOpen(T1 &lat) // Lattice leaf +{ + lat.ViewOpen(AcceleratorRead); +} +template ::value, T1>::type * = nullptr> + inline void ExpressionViewOpen(T1 ¬lat) {} + +template inline +void ExpressionViewOpen(LatticeUnaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST +} + +template inline +void ExpressionViewOpen(LatticeBinaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST + ExpressionViewOpen(expr.arg2); // recurse AST +} +template +inline void ExpressionViewOpen(LatticeTrinaryExpression &expr) +{ + ExpressionViewOpen(expr.arg1); // recurse AST + ExpressionViewOpen(expr.arg2); // recurse AST + ExpressionViewOpen(expr.arg3); // recurse AST +} + +////////////////////////////////////////////////////////////////////////// +// ViewClose +////////////////////////////////////////////////////////////////////////// +template ::value, T1>::type * = nullptr> +inline void ExpressionViewClose( T1 &lat) // Lattice leaf +{ + lat.ViewClose(); +} +template ::value, T1>::type * = nullptr> +inline void ExpressionViewClose(T1 ¬lat) {} + +template inline +void ExpressionViewClose(LatticeUnaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST +} +template inline +void ExpressionViewClose(LatticeBinaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST + ExpressionViewClose(expr.arg2); // recurse AST +} +template +inline void ExpressionViewClose(LatticeTrinaryExpression &expr) +{ + ExpressionViewClose(expr.arg1); // recurse AST + ExpressionViewClose(expr.arg2); // recurse AST + ExpressionViewClose(expr.arg3); // recurse AST +} + //////////////////////////////////////////// // Unary operators and funcs //////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index c4a67620..a3ae1f28 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); conformable(ret,rhs); conformable(lhs,rhs); 
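The ExpressionViewOpen/ExpressionViewClose helpers introduced above walk the expression AST and only touch Lattice leaves; non-lattice leaves (scalars) fall through to an empty overload selected by SFINAE, and unary/binary/trinary nodes simply recurse into their arguments. A reduced sketch of the same dispatch, using hypothetical Leaf and BinaryExpr types in place of the Grid classes:

#include <type_traits>

struct LeafBase {};                                   // stand-in for LatticeBase
template<class T> using is_leaf = std::is_base_of<LeafBase, T>;

struct Leaf : LeafBase { void ViewOpen() {} void ViewClose() {} };

// Leaf overloads selected by SFINAE: only true leaves get opened.
template<class T, typename std::enable_if<is_leaf<T>::value, T>::type * = nullptr>
void OpenViews(T &l) { l.ViewOpen(); }
template<class T, typename std::enable_if<!is_leaf<T>::value, T>::type * = nullptr>
void OpenViews(T &) {}                                // scalars: nothing to do

// Binary node: recurse into both arguments; the more specialized overload wins.
template<class Op, class A, class B>
struct BinaryExpr { Op op; A arg1; B arg2; };

template<class Op, class A, class B>
void OpenViews(BinaryExpr<Op,A,B> &e) { OpenViews(e.arg1); OpenViews(e.arg2); }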
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -56,9 +56,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -73,9 +73,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -89,9 +89,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -108,8 +108,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -121,8 +121,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -135,8 +135,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -148,8 +148,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto lhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -165,8 +165,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ 
ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -179,8 +179,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -193,8 +193,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -206,8 +206,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto rhs_v = lhs.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -221,9 +221,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto x_v = x.AcceleratorView(ViewRead); - auto y_v = y.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( x_v , x, AcceleratorRead); + autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -234,9 +234,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.AcceleratorView(ViewWrite); - auto x_v = x.AcceleratorView(ViewRead); - auto y_v = y.AcceleratorView(ViewRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( x_v , x, AcceleratorRead); + autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 42e9e50a..73b1b6a1 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + #pragma once #define STREAMING_STORES @@ -37,161 +38,6 @@ NAMESPACE_BEGIN(Grid); extern int GridCshiftPermuteMap[4][16]; -/////////////////////////////////////////////////////////////////// -// Base class which can be used by traits to pick up behaviour -/////////////////////////////////////////////////////////////////// -class LatticeBase {}; - -///////////////////////////////////////////////////////////////////////////////////////// -// 
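The systematic swap from View()/AcceleratorView() to autoView throughout these kernels pairs every open with a close when the enclosing scope ends; the earlier raw views were never told when host or device access had finished. A minimal sketch of that presumed scope-bound shape, with hypothetical names (the real autoView macro and LatticeView class in Grid may differ in detail):

#include <vector>
#include <cstdio>

// Sketch only: hypothetical field type with open/close notifications.
struct SketchField {
  std::vector<double> host;
  void ViewOpen (int mode) { std::printf("open view, mode %d\n", mode); }
  void ViewClose()         { std::printf("close view\n"); }
};

class ScopedView {
  SketchField &owner;
public:
  ScopedView(SketchField &f, int mode) : owner(f) { owner.ViewOpen(mode); }
  ~ScopedView() { owner.ViewClose(); }       // closes on scope exit, however we leave
  double &operator[](size_t i) { return owner.host[i]; }
};

int main() {
  SketchField f; f.host.resize(8, 0.0);
  {
    ScopedView v(f, /*write-like mode*/ 1);
    for (size_t i = 0; i < f.host.size(); i++) v[i] = 2.0 * i;
  }                                          // view closed here; data may now be evicted/migrated
  return 0;
}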
Conformable checks; same instance of Grid required -///////////////////////////////////////////////////////////////////////////////////////// -void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) -{ - assert(lhs == rhs); -} - -//////////////////////////////////////////////////////////////////////////// -// Advise the LatticeAccelerator class -//////////////////////////////////////////////////////////////////////////// -enum LatticeAcceleratorAdvise { - AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can - // significantly influence performance of bulk storage. - AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures - // enables read-only copies of memory to be kept on - // host and device. -}; - -//////////////////////////////////////////////////////////////////////////// -// View Access Mode -//////////////////////////////////////////////////////////////////////////// -enum ViewMode { - ViewRead = 0x1, - ViewWrite = 0x2, - ViewReadWrite = 0x3 -}; - -//////////////////////////////////////////////////////////////////////////// -// Minimal base class containing only data valid to access from accelerator -// _odata will be a managed pointer in CUDA -//////////////////////////////////////////////////////////////////////////// -// Force access to lattice through a view object. -// prevents writing of code that will not offload to GPU, but perhaps annoyingly -// strict since host could could in principle direct access through the lattice object -// Need to decide programming model. -#define LATTICE_VIEW_STRICT -template class LatticeAccelerator : public LatticeBase -{ -protected: - GridBase *_grid; - int checkerboard; - vobj *_odata; // A managed pointer - uint64_t _odata_size; -public: - accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { }; - accelerator_inline uint64_t oSites(void) const { return _odata_size; }; - accelerator_inline int Checkerboard(void) const { return checkerboard; }; - accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view - accelerator_inline void Conformable(GridBase * &grid) const - { - if (grid) conformable(grid, _grid); - else grid = _grid; - }; - - accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future - gridAcceleratorPrefetch(_odata,_odata_size*sizeof(vobj)); - }; - - accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future -#ifdef GRID_NVCC -#ifndef __CUDA_ARCH__ // only on host - //cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); -#endif -#endif - }; -}; - -///////////////////////////////////////////////////////////////////////////////////////// -// A View class which provides accessor to the data. 
-// This will be safe to call from accelerator_for and is trivially copy constructible -// The copy constructor for this will need to be used by device lambda functions -///////////////////////////////////////////////////////////////////////////////////////// -template -class LatticeView : public LatticeAccelerator -{ -public: - - - // Rvalue -#ifdef __CUDA_ARCH__ - accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); } -#else - accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } -#endif - - accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; - accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; - - accelerator_inline uint64_t begin(void) const { return 0;}; - accelerator_inline uint64_t end(void) const { return this->_odata_size; }; - accelerator_inline uint64_t size(void) const { return this->_odata_size; }; - - LatticeView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me) - { - } -}; - -///////////////////////////////////////////////////////////////////////////////////////// -// Lattice expression types used by ET to assemble the AST -// -// Need to be able to detect code paths according to the whether a lattice object or not -// so introduce some trait type things -///////////////////////////////////////////////////////////////////////////////////////// - -class LatticeExpressionBase {}; - -template using is_lattice = std::is_base_of; -template using is_lattice_expr = std::is_base_of; - -template struct ViewMapBase { typedef T Type; }; -template struct ViewMapBase { typedef LatticeView Type; }; -template using ViewMap = ViewMapBase::value >; - -template -class LatticeUnaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - Op op; - T1 arg1; - LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {}; -}; - -template -class LatticeBinaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - typedef typename ViewMap<_T2>::Type T2; - Op op; - T1 arg1; - T2 arg2; - LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {}; -}; - -template -class LatticeTrinaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - typedef typename ViewMap<_T2>::Type T2; - typedef typename ViewMap<_T3>::Type T3; - Op op; - T1 arg1; - T2 arg2; - T3 arg3; - LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {}; -}; - ///////////////////////////////////////////////////////////////////////////////////////// // The real lattice class, with normal copy and assignment semantics. 
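The AcceleratorWriteDiscard mode used below for assignment targets and freshly constructed temporaries exists, presumably, so the cache never pays for a host-to-device copy of data that is about to be fully overwritten. A toy decision table for when an opening device view must transfer data, using state names matching the Print() output earlier (Empty/CpuDirty/AccDirty/Consistent); this is an illustration, not the memory manager's actual transition code:

// Sketch: does opening a device view require copying host data to the device?
// Only when the device copy is stale AND the caller may read the old contents.
enum ToyState { ToyEmpty, ToyCpuDirty, ToyAccDirty, ToyConsistent };
enum ToyMode  { ToyAccRead, ToyAccWrite, ToyAccWriteDiscard };

bool NeedHostToDevice(ToyState s, ToyMode mode)
{
  bool device_stale = (s == ToyCpuDirty) || (s == ToyEmpty);
  bool caller_reads = (mode == ToyAccRead) || (mode == ToyAccWrite);
  return device_stale && caller_reads;   // WriteDiscard: contents are "don't care", skip the copy
}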
// This contains extra (host resident) grid pointer data that may be accessed by host code @@ -235,44 +81,25 @@ private: } public: - void Advise(int advise) { -#ifdef GRID_NVCC -#ifndef __CUDA_ARCH__ // only on host - if (advise & AdviseInfrequentUse) { - gridMoveToHost((void**)&this->_odata); - } - if (advise & AdviseReadMostly) { - //cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1); - } -#endif -#endif - }; - + ///////////////////////////////////////////////////////////////////////////////// + // Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents + ///////////////////////////////////////////////////////////////////////////////// + void SetViewMode(ViewMode mode) { + LatticeView accessor(*( (LatticeAccelerator *) this),mode); + accessor.ViewClose(); + } ///////////////////////////////////////////////////////////////////////////////// // Return a view object that may be dereferenced in site loops. // The view is trivially copy constructible and may be copied to an accelerator device // in device lambdas ///////////////////////////////////////////////////////////////////////////////// - LatticeView View (void) const // deprecated, should pick AcceleratorView for accelerator_for - { // and HostView for thread_for - LatticeView accessor(*( (LatticeAccelerator *) this)); + + LatticeView View (ViewMode mode) const + { + LatticeView accessor(*( (LatticeAccelerator *) this),mode); return accessor; } - LatticeView AcceleratorView(int mode = ViewReadWrite) const - { - LatticeView accessor(*( (LatticeAccelerator *) this)); - //accessor.AcceleratorPrefetch(mode); - return accessor; - } - - LatticeView HostView(int mode = ViewReadWrite) const - { - LatticeView accessor(*( (LatticeAccelerator *) this)); - //accessor.HostPrefetch(mode); - return accessor; - } - ~Lattice() { if ( this->_odata_size ) { dealloc(); @@ -292,12 +119,16 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - - auto me = AcceleratorView(ViewWrite); + + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); + ExpressionViewClose(exprCopy); return *this; } template inline Lattice & operator=(const LatticeBinaryExpression &expr) @@ -312,11 +143,15 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = AcceleratorView(ViewWrite); + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); + ExpressionViewClose(exprCopy); return *this; } template inline Lattice & operator=(const LatticeTrinaryExpression &expr) @@ -330,11 +165,15 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = AcceleratorView(ViewWrite); + auto exprCopy = expr; + ExpressionViewOpen(exprCopy); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,expr); + auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); + ExpressionViewClose(exprCopy); return *this; } //GridFromExpression is tricky to do @@ -385,10 +224,11 @@ public: } template inline Lattice & operator = (const sobj & r){ - auto me = View(); + auto me = View(CpuWrite); thread_for(ss,me.size(),{ - 
me[ss] = r; + me[ss]= r; }); + me.ViewClose(); return *this; } @@ -398,11 +238,12 @@ public: /////////////////////////////////////////// // user defined constructor /////////////////////////////////////////// - Lattice(GridBase *grid) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); this->checkerboard=0; + SetViewMode(mode); } // virtual ~Lattice(void) = default; @@ -440,11 +281,12 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = AcceleratorView(ViewWrite); - auto him= r.AcceleratorView(ViewRead); + auto me = View(AcceleratorWriteDiscard); + auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); + me.ViewClose(); him.ViewClose(); return *this; } @@ -454,11 +296,12 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = AcceleratorView(ViewWrite); - auto him= r.AcceleratorView(ViewRead); + auto me = View(AcceleratorWriteDiscard); + auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); + me.ViewClose(); him.ViewClose(); return *this; } /////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index f1126936..b930e018 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -51,34 +51,39 @@ template void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) { typedef decltype(basis[0]) Field; - typedef decltype(basis[0].View()) View; - auto tmp_v = basis[0].AcceleratorView(ViewReadWrite); - Vector basis_v(basis.size(),tmp_v); - typedef typename std::remove_reference::type vobj; + typedef decltype(basis[0].View(AcceleratorRead)) View; + + Vector basis_v; basis_v.reserve(basis.size()); + typedef typename std::remove_reference::type vobj; + typedef typename std::remove_reference::type Coeff_t; GridBase* grid = basis[0].Grid(); for(int k=0;k Bt(Nm * max_threads); thread_region - { - std::vector < vobj > B(Nm); // Thread private - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(),{ + for(int j=j0; j Qt_jv(Nm*Nm); - double *Qt_p = & Qt_jv[0]; + Vector Qt_jv(Nm*Nm); + Coeff_t *Qt_p = & Qt_jv[0]; thread_for(i,Nm*Nm,{ int j = i/Nm; int k = i%Nm; Qt_p[i]=Qt(j,k); - }); + }); // Block the loop to keep storage footprint down for(uint64_t s=0;s void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) { - typedef decltype(basis[0].AcceleratorView()) View; + typedef decltype(basis[0].View(AcceleratorRead)) View; typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.AcceleratorView(ViewWrite); - Vector basis_v(basis.size(),result_v); + + Vector basis_v; basis_v.reserve(basis.size()); for(int k=0;k Qt_jv(Nm); double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ auto B=coalescedRead(zz); for(int k=k0; k &basis,Eigen::MatrixXd& Qt,in } coalescedWrite(result_v[ss], B); }); + for(int k=0;k diff --git a/Grid/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h index bbed2ef5..6a29be94 100644 --- a/Grid/lattice/Lattice_comparison.h +++ b/Grid/lattice/Lattice_comparison.h @@ -78,9 +78,9 @@ template inline Lattice LLComparison(vfunctor op,const 
Lattice &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v, lhs, CpuRead); + autoView( rhs_v, rhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); }); @@ -93,8 +93,8 @@ template inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) { Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v, lhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, lhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs); }); @@ -107,8 +107,8 @@ template inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( rhs_v, rhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs,rhs_v[ss]); }); diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index a1abe58d..cd0f11ee 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,7 +37,7 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - auto l_v = l.View(); + autoView(l_v, l, CpuWrite); thread_for( o, grid->oSites(), { vector_type vI; Coordinate gcoor; @@ -51,23 +51,5 @@ template inline void LatticeCoordinate(Lattice &l,int mu) }); }; -// LatticeCoordinate(); -// FIXME for debug; deprecate this; made obscelete by -template void lex_sites(Lattice &l){ - auto l_v = l.View(); - Real *v_ptr = (Real *)&l_v[0]; - size_t o_len = l.Grid()->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); - - for(int i=0;i inline auto localNorm2 (const Lattice &rhs)-> Lattice { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); }); @@ -56,9 +56,9 @@ template inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); }); @@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Latt typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(rr())) srr; Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),1,{ // FIXME had issues with scalar version of outer // Use vector [] operator and don't read coalesce this loop diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h index 0980ad8a..7c470fef 100644 --- a/Grid/lattice/Lattice_matrix_reduction.h +++ b/Grid/lattice/Lattice_matrix_reduction.h @@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int 
ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(); - auto Y_v = Y.View(); - auto R_v = R.View(); + autoView( X_v , X, CpuRead); + autoView( Y_v , Y, CpuRead); + autoView( R_v , R, CpuWrite); thread_region { std::vector s_x(Nblock); @@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(); - auto R_v = R.View(); + autoView( X_v , X, CpuRead); + autoView( R_v , R, CpuWrite); thread_region { @@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice int ostride=FullGrid->_ostride[Orthog]; typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + autoView( lhs_v , lhs, CpuRead); + autoView( rhs_v , rhs, CpuRead); thread_region { std::vector Left(Nblock); std::vector Right(Nblock); diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index 8f649bd7..c79becf2 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -46,9 +46,9 @@ auto PeekIndex(const Lattice &lhs,int i) -> Lattice(vobj(),i))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); + accelerator_for( ss, lhs_v.size(), 1, { ret_v[ss] = peekIndex(lhs_v[ss],i); }); return ret; @@ -58,9 +58,9 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice(vobj(),i,j))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); + accelerator_for( ss, lhs_v.size(), 1, { ret_v[ss] = peekIndex(lhs_v[ss],i,j); }); return ret; @@ -72,18 +72,18 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice void PokeIndex(Lattice &lhs,const Lattice(vobj(),0))> & rhs,int i) { - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + accelerator_for( ss, lhs_v.size(), 1, { pokeIndex(lhs_v[ss],rhs_v[ss],i); }); } template void PokeIndex(Lattice &lhs,const Lattice(vobj(),0,0))> & rhs,int i,int j) { - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); - thread_for( ss, lhs_v.size(), { + autoView( rhs_v, rhs, AcceleratorRead); + autoView( lhs_v, lhs, AcceleratorWrite); + accelerator_for( ss, lhs_v.size(), 1, { pokeIndex(lhs_v[ss],rhs_v[ss],i,j); }); } @@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ // extract-modify-merge cycle is easiest way and this is not perf critical ExtractBuffer buf(Nsimd); - auto l_v = l.View(); + autoView( l_v , l, CpuWrite); if ( rank == grid->ThisRank() ) { extract(l_v[odx],buf); buf[idx] = s; @@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ grid->GlobalCoorToRankIndex(rank,odx,idx,site); ExtractBuffer buf(Nsimd); - auto l_v = l.View(); + autoView( l_v , l, CpuWrite); extract(l_v[odx],buf); s = buf[idx]; @@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ return; }; - ////////////////////////////////////////////////////////// // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// +// Must be CPU read view template -inline void 
peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ - - GridBase *grid = l.Grid(); - +inline void peekLocalSite(sobj &s,const LatticeView &l,Coordinate &site) +{ + GridBase *grid = l.getGrid(); + assert(l.mode==CpuRead); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nsimd = grid->Nsimd(); - assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); + assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); static const int words=sizeof(vobj)/sizeof(vector_type); @@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(); - scalar_type * vp = (scalar_type *)&l_v[odx]; + scalar_type * vp = (scalar_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w &l,Coordinate &site){ return; }; - +// Must be CPU write view template -inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ - - GridBase *grid=l.Grid(); +inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) +{ + GridBase *grid=l.getGrid(); + assert(l.mode==CpuWrite); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nsimd = grid->Nsimd(); - assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); + assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); static const int words=sizeof(vobj)/sizeof(vector_type); @@ -202,13 +202,11 @@ inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(); - scalar_type * vp = (scalar_type *)&l_v[odx]; + scalar_type * vp = (scalar_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w inline Lattice adj(const Lattice &lhs){ Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + ret.Checkerboard()=lhs.Checkerboard(); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], adj(lhs_v(ss))); }); @@ -51,9 +53,11 @@ template inline Lattice adj(const Lattice &lhs){ template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + ret.Checkerboard() = lhs.Checkerboard(); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 3147823d..c2955485 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -25,7 +25,7 @@ Author: Christoph Lehner #include -#ifdef GRID_NVCC +#if defined(GRID_CUDA)||defined(GRID_HIP) #include #endif @@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) { typedef typename vobj::scalar_object sobj; - const int Nsimd = vobj::Nsimd(); + // const int Nsimd = vobj::Nsimd(); + const int nthread = GridThread::GetThreads(); + + Vector sumarray(nthread); + for(int i=0;i +inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) +{ + typedef typename vobj::scalar_objectD sobj; + const int nthread = GridThread::GetThreads(); Vector sumarray(nthread); @@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) ssum = 
ssum+sumarray[i]; } - return ssum; + typedef typename vobj::scalar_object ssobj; + ssobj ret = ssum; + return ret; } + + template inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) { -#ifdef GRID_NVCC +#if defined(GRID_CUDA)||defined(GRID_HIP) return sum_gpu(arg,osites); #else return sum_cpu(arg,osites); #endif } +template +inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) +{ +#if defined(GRID_CUDA)||defined(GRID_HIP) + return sumD_gpu(arg,osites); +#else + return sumD_cpu(arg,osites); +#endif +} + template inline typename vobj::scalar_object sum(const Lattice &arg) { - auto arg_v = arg.View(); +#if defined(GRID_CUDA)||defined(GRID_HIP) + autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum(&arg_v[0],osites); + auto ssum= sum_gpu(&arg_v[0],osites); +#else + autoView(arg_v, arg, CpuRead); + Integer osites = arg.Grid()->oSites(); + auto ssum= sum_cpu(&arg_v[0],osites); +#endif arg.Grid()->GlobalSum(ssum); return ssum; } @@ -101,43 +150,30 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & ComplexD nrm; GridBase *grid = left.Grid(); - - // Might make all code paths go this way. - auto left_v = left.AcceleratorView(ViewRead); - auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); -#ifdef GRID_NVCC - // GPU - SIMT lane compliance... - typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; + // Might make all code paths go this way. + typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - + + { + autoView( left_v , left, AcceleratorRead); + autoView( right_v,right, AcceleratorRead); - accelerator_for( ss, sites, nsimd,{ - auto x_l = left_v(ss); - auto y_l = right_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); - }) + // GPU - SIMT lane compliance... 
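The CPU paths above (sum_cpu and the new sumD_cpu) use the standard two-stage reduction: each thread accumulates a private partial sum indexed by its thread id, the partials are combined serially afterwards, and sumD_cpu widens the accumulator to double so the single-precision failures noted in the comment below are avoided. A plain OpenMP sketch of the same shape:

#include <vector>
#include <omp.h>

// Sketch: reduce n single-precision values into a double, two-stage.
double sum_two_stage(const float *arg, long n)
{
  int nthread = omp_get_max_threads();
  std::vector<double> partial(nthread, 0.0);   // one slot per thread, no races

  #pragma omp parallel
  {
    int me = omp_get_thread_num();
    #pragma omp for
    for (long i = 0; i < n; i++)
      partial[me] += (double)arg[i];           // accumulate in double, as sumD_cpu does
  }

  double total = 0.0;                          // serial combine of the partials
  for (int t = 0; t < nthread; t++) total += partial[t];
  return total;
}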
+ accelerator_for( ss, sites, 1,{ + auto x_l = left_v[ss]; + auto y_l = right_v[ss]; + inner_tmp_v[ss]=innerProductD(x_l,y_l); + }); + } // This is in single precision and fails some tests - // Need a sumD that sums in double - nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites)); -#else - // CPU - typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto x_l = left_v[ss]; - auto y_l = right_v[ss]; - inner_tmp_v[ss]=innerProductD(x_l,y_l); - }) - nrm = TensorRemove(sum(inner_tmp_v,sites)); -#endif + auto anrm = sum(inner_tmp_v,sites); + nrm = anrm; return nrm; } @@ -175,40 +211,24 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt GridBase *grid = x.Grid(); - auto x_v=x.AcceleratorView(ViewRead); - auto y_v=y.AcceleratorView(ViewRead); - auto z_v=z.AcceleratorView(ViewWrite); - const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); -#ifdef GRID_NVCC // GPU - typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); - }); - - nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites))); -#else - // CPU typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); + + accelerator_for( ss, sites, 1,{ + auto tmp = a*x_v[ss]+b*y_v[ss]; inner_tmp_v[ss]=innerProductD(tmp,tmp); z_v[ss]=tmp; }); - // Already promoted to double nrm = real(TensorRemove(sum(inner_tmp_v,sites))); -#endif grid->GlobalSum(nrm); return nrm; } @@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti GridBase *grid = left.Grid(); - auto left_v=left.AcceleratorView(ViewRead); - auto right_v=right.AcceleratorView(ViewRead); - const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); -#ifdef GRID_NVCC // GPU - typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; - typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t; + typedef decltype(innerProductD(vobj(),vobj())) inner_t; + typedef decltype(innerProductD(vobj(),vobj())) norm_t; Vector inner_tmp(sites); - Vector norm_tmp(sites); + Vector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; + { + autoView(left_v,left, AcceleratorRead); + autoView(right_v,right,AcceleratorRead); + accelerator_for( ss, sites, 1,{ + auto left_tmp = left_v[ss]; + inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]); + norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp); + }); + } - accelerator_for( ss, sites, nsimd,{ - auto left_tmp = left_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); - coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); - }); - - tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites)); - tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites)); -#else - // CPU - typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t; - typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t; - Vector inner_tmp(sites); - Vector norm_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - auto norm_tmp_v = &norm_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto 
left_tmp = left_v(ss); - inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss)); - norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp); - }); - // Already promoted to double tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); -#endif + grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector ip = tmp[0]; nrm = real(tmp[1]); @@ -335,7 +337,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction - auto Data_v=Data.View(); + autoView( Data_v, Data, CpuRead); thread_for( r,rd, { int so=r*grid->_ostride[orthogdim]; // base offset for start of plane for(int n=0;n & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - auto lhv=lhs.View(); - auto rhv=rhs.View(); + autoView( lhv, lhs, CpuRead); + autoView( rhv, rhs, CpuRead); thread_for( r,rd,{ int so=r*grid->_ostride[orthogdim]; // base offset for start of plane @@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice tensor_reduced at; at=av; - auto Rv=R.View(); - auto Xv=X.View(); - auto Yv=Y.View(); - thread_for_collapse(2, n, e1, { - for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v=X.View(); - auto Y_v=Y.View(); - auto R_v=R.View(); + autoView( X_v, X, CpuRead); + autoView( Y_v, Y, CpuRead); + autoView( R_v, R, CpuWrite); thread_region { Vector s_x(Nblock); @@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< // int nl=1; //FIXME package in a convenient iterator + // thread_for2d_in_region //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto R_v = R.View(); - auto X_v = X.View(); + autoView( R_v, R, CpuWrite); + autoView( X_v, X, CpuRead); thread_region { std::vector s_x(Nblock); @@ -692,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v=lhs.View(); - auto rhs_v=rhs.View(); + autoView( lhs_v, lhs, CpuRead); + autoView( rhs_v, rhs, CpuRead); thread_region { std::vector Left(Nblock); diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index c5d75356..5f490507 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -1,7 +1,13 @@ NAMESPACE_BEGIN(Grid); -#define WARP_SIZE 32 +#ifdef GRID_HIP +extern hipDeviceProp_t *gpu_props; +#endif +#ifdef GRID_CUDA extern cudaDeviceProp *gpu_props; +#endif + +#define WARP_SIZE 32 __device__ unsigned int retirementCount = 0; template @@ -19,7 +25,12 @@ template void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { int device; +#ifdef GRID_CUDA cudaGetDevice(&device); +#endif +#ifdef GRID_HIP + hipGetDevice(&device); +#endif Iterator warpSize = gpu_props[device].warpSize; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; @@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { sobj *smem = (sobj *)shmem_pointer; // wait until all outstanding memory instructions in this thread are finished - __threadfence(); + acceleratorFence(); if (tid==0) { unsigned int ticket 
= atomicInc(&retirementCount, gridDim.x); @@ -156,8 +167,8 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { } // each thread must read the correct value of amLast - __syncthreads(); - + acceleratorSynchroniseAll(); + if (amLast) { // reduce buffer[0], ..., buffer[gridDim.x-1] Iterator i = tid; @@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) sobj *buffer_v = &buffer[0]; reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); - cudaDeviceSynchronize(); - - cudaError err = cudaGetLastError(); - if ( cudaSuccess != err ) { - printf("Cuda error %s\n",cudaGetErrorString( err )); - exit(0); - } + accelerator_barrier(); auto result = buffer_v[0]; return result; } diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 1bb1f087..e5e63716 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -375,7 +375,7 @@ public: int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int words = sizeof(scalar_object) / sizeof(scalar_type); - auto l_v = l.View(); + autoView(l_v, l, CpuWrite); thread_for( ss, osites, { ExtractBuffer buf(Nsimd); for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times @@ -461,8 +461,8 @@ public: } { - // Obtain one reseeded generator per thread - int Nthread = GridThread::GetThreads(); + // Obtain one reseeded generator per thread + int Nthread = 32; // Hardwire a good level or parallelism std::vector seeders(Nthread); for(int t=0;t inline auto trace(const Lattice &lhs) -> Lattice { Lattice ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView(ret_v , ret, AcceleratorWrite); + autoView(lhs_v , lhs, AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], trace(lhs_v(ss))); }); @@ -58,8 +58,8 @@ template inline auto TraceIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], traceIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 435b7851..beceecc9 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine) //////////////////////////////////////////////////////////////////////////////////////////// // remove and insert a half checkerboard //////////////////////////////////////////////////////////////////////////////////////////// -template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ +template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full) +{ half.Checkerboard() = cb; - auto half_v = half.View(); - auto full_v = full.View(); + autoView( half_v, half, CpuWrite); + autoView( full_v, full, CpuRead); thread_for(ss, full.Grid()->oSites(),{ int cbos; Coordinate coor; @@ -64,11 +65,11 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } }); } - -template inline void setCheckerboard(Lattice &full,const Lattice &half){ +template inline void setCheckerboard(Lattice &full,const Lattice &half) +{ int cb = half.Checkerboard(); - auto half_v = half.View(); - auto full_v = full.View(); + autoView( half_v , half, CpuRead); + autoView( full_v , full, CpuWrite); 
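The checkerboard helpers here remain on the host thread_for path, so they open CpuRead/CpuWrite views. A hedged usage sketch (illustrative code, not part of the patch; grids and seeds are arbitrary) splitting a field into its two checkerboards and reassembling it:

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);

  GridParallelRNG RNG(FGrid);
  RNG.SeedFixedIntegers(std::vector<int>({5, 6, 7, 8}));

  LatticeComplex full(FGrid), recon(FGrid);
  LatticeComplex even(FrbGrid), odd(FrbGrid);
  random(RNG, full);

  pickCheckerboard(Even, even, full);   // host loop over the full grid, CpuRead/CpuWrite views
  pickCheckerboard(Odd,  odd,  full);
  setCheckerboard(recon, even);         // reassemble both halves
  setCheckerboard(recon, odd);

  recon = recon - full;                 // the round trip should be exact
  std::cout << GridLogMessage << " checkerboard round-trip error " << norm2(recon) << std::endl;

  Grid_finalize();
  return 0;
}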
thread_for(ss,full.Grid()->oSites(),{ Coordinate coor; @@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & out = in; } -#ifdef GRID_NVCC +#ifdef GRID_SIMT accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { - ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; + ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in; } accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { - ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; + ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in; } accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { - ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; + ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in; } #endif @@ -151,12 +152,11 @@ accelerator_inline void convertType(T & out, const T & in) { template accelerator_inline void convertType(Lattice & out, const Lattice & in) { - auto out_v = out.AcceleratorView(ViewWrite); - auto in_v = in.AcceleratorView(ViewRead); - + autoView( out_v , out,AcceleratorWrite); + autoView( in_v , in ,AcceleratorRead); accelerator_for(ss,out_v.size(),T1::Nsimd(),{ convertType(out_v[ss],in_v(ss)); - }); + }); } //////////////////////////////////////////////////////////////////////////////////////////// @@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice & out, const Lattice & in) { //////////////////////////////////////////////////////////////////////////////////////////// template inline auto localInnerProductD(const Lattice &lhs,const Lattice &rhs) --> Lattice> +-> Lattice> { - auto lhs_v = lhs.AcceleratorView(ViewRead); - auto rhs_v = rhs.AcceleratorView(ViewRead); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; Lattice> ret(lhs.Grid()); - auto ret_v = ret.AcceleratorView(ViewWrite); - accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ + { + autoView(ret_v, ret,AcceleratorWrite); + accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); }); - + } return ret; } @@ -194,14 +195,13 @@ inline void blockProject(Lattice > &coarseData, Lattice> ip(coarse); Lattice fineDataRed = fineData; - // auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.AcceleratorView(ViewWrite); - auto ip_ = ip.AcceleratorView(ViewReadWrite); + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( ip_ , ip, AcceleratorWrite); for(int v=0;v accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { convertType(coarseData_[sc](v),ip_[sc]); - }); + }); // improve numerical stability of projection // |fine> = |fine> - |basis> @@ -210,68 +210,6 @@ inline void blockProject(Lattice > &coarseData, } } -template -inline void blockProject1(Lattice > &coarseData, - const Lattice &fineData, - const std::vector > &Basis) -{ - typedef iVector coarseSiteData; - coarseSiteData elide; - typedef decltype(coalescedRead(elide)) ScalarComplex; - GridBase * fine = fineData.Grid(); - GridBase * coarse= coarseData.Grid(); - int _ndimension = coarse->_ndimension; - - // checks - assert( nbasis == Basis.size() ); - subdivides(coarse,fine); - for(int i=0;i_rdimensions[d] / coarse->_rdimensions[d]; - assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); - } - int blockVol = fine->oSites()/coarse->oSites(); - - coarseData=Zero(); - - auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); - 
//////////////////////////////////////////////////////////////////////////////////////////////////////// - // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. - // Otherwise do fine inner product per site, and make the update atomic - //////////////////////////////////////////////////////////////////////////////////////////////////////// - accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { - - auto sc=sci/nbasis; - auto i=sci%nbasis; - auto Basis_ = Basis[i].View(); - - Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - - int sf; - decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); - - for(int sb=0;sb_rdimensions); - - reduce=reduce+innerProduct(Basis_(sf),fineData_(sf)); - } - coalescedWrite(coarseData_[sc](i),reduce); - }); - return; -} template inline void blockZAXPY(Lattice &fineZ, @@ -298,10 +236,10 @@ template assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ = fineZ.AcceleratorView(ViewWrite); - auto fineX_ = fineX.AcceleratorView(ViewRead); - auto fineY_ = fineY.AcceleratorView(ViewRead); - auto coarseA_= coarseA.AcceleratorView(ViewRead); + autoView( fineZ_ , fineZ, AcceleratorWrite); + autoView( fineX_ , fineX, AcceleratorRead); + autoView( fineY_ , fineY, AcceleratorRead); + autoView( coarseA_, coarseA, AcceleratorRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { @@ -314,7 +252,7 @@ template Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT typename vobj2::tensor_reduced::scalar_object cA; typename vobj::scalar_object cAx; #else @@ -344,15 +282,16 @@ template Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); Lattice coarse_inner(coarse); - auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); - auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); - // Precision promotion - fine_inner = localInnerProductD(fineX,fineY); + fine_inner = localInnerProductD(fineX,fineY); blockSum(coarse_inner,fine_inner); - accelerator_for(ss, coarse->oSites(), 1, { + { + autoView( CoarseInner_ , CoarseInner,AcceleratorWrite); + autoView( coarse_inner_ , coarse_inner,AcceleratorRead); + accelerator_for(ss, coarse->oSites(), 1, { convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); }); + } } @@ -370,14 +309,15 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? 
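The block operations in this file are normally driven from a coarse/fine grid pair. A usage sketch of blockInnerProduct as reorganised here (illustrative only, not part of the patch; the 8^4 to 4^4 blocking and the field names are arbitrary, and the fine extents must divide evenly by the coarse ones):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  Coordinate latt ({8, 8, 8, 8});
  Coordinate clatt({4, 4, 4, 4});       // 2^4 blocks
  GridCartesian *FineGrid   = SpaceTimeGrid::makeFourDimGrid(latt,
                                GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridCartesian *CoarseGrid = SpaceTimeGrid::makeFourDimGrid(clatt,
                                GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());

  GridParallelRNG RNG(FineGrid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  LatticeComplex fineX(FineGrid), fineY(FineGrid);
  random(RNG, fineX);
  random(RNG, fineY);

  LatticeComplex coarseIP(CoarseGrid);
  blockInnerProduct(coarseIP, fineX, fineY);   // block-local <x,y>, one value per coarse cell
  std::cout << GridLogMessage << " block inner product norm " << norm2(coarseIP) << std::endl;

  Grid_finalize();
  return 0;
}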
- auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); - auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); - fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - accelerator_for(ss, coarse->oSites(), 1, { - convertType(CoarseInner_[ss],coarse_inner_[ss]); + { + autoView( CoarseInner_ , CoarseInner, AcceleratorWrite); + autoView( coarse_inner_ , coarse_inner, AcceleratorRead); + accelerator_for(ss, coarse->oSites(), 1, { + CoarseInner_[ss] = coarse_inner_[ss]; }); + } } template @@ -408,8 +348,10 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) } int blockVol = fine->oSites()/coarse->oSites(); - auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); - auto fineData_ = fineData.AcceleratorView(ViewRead); + // Turn this around to loop threaded over sc and interior loop + // over sf would thread better + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( fineData_ , fineData, AcceleratorRead); accelerator_for(sc,coarse->oSites(),1,{ @@ -510,8 +452,8 @@ inline void blockPromote(const Lattice > &coarseData, for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } - auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); + autoView( fineData_ , fineData, AcceleratorWrite); + autoView( coarseData_ , coarseData, AcceleratorRead); // Loop with a cache friendly loop ordering accelerator_for(sf,fine->oSites(),1,{ @@ -524,7 +466,7 @@ inline void blockPromote(const Lattice > &coarseData, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); for(int i=0;i > &coarseData, fineData=Zero(); for(int i=0;i > ip = PeekIndex<0>(coarseData,i); - auto ip_ = ip.AcceleratorView(ViewRead); + + //Lattice cip(coarse); + //autoView( cip_ , cip, AcceleratorWrite); + //autoView( ip_ , ip, AcceleratorRead); + //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ + // coalescedWrite(cip_[sc], ip_(sc)()); + // }); + //blockZAXPY(fineData,cip,Basis[i],fineData); blockZAXPY(fineData,ip,Basis[i],fineData); } } @@ -571,15 +520,17 @@ void localConvert(const Lattice &in,Lattice &out) assert(ig->lSites() == og->lSites()); } + autoView(in_v,in,CpuRead); + autoView(out_v,out,CpuWrite); thread_for(idx, ig->lSites(),{ sobj s; ssobj ss; Coordinate lcoor(ni); ig->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(s,in,lcoor); + peekLocalSite(s,in_v,lcoor); ss=s; - pokeLocalSite(ss,out,lcoor); + pokeLocalSite(ss,out_v,lcoor); }); } @@ -614,8 +565,9 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.AcceleratorView(ViewWrite); - auto f_v = From.AcceleratorView(ViewRead); + + autoView( t_v , To, AcceleratorWrite); + autoView( f_v , From, AcceleratorRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd); @@ -638,8 +590,6 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int w=0;w &lowDim,Lattice & higherDim,int slice } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuRead); + autoView(higherDimv,higherDim,CpuWrite); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -682,8 +634,8 @@ void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice hcoor[d]=lcoor[ddl++]; } } - peekLocalSite(s,lowDim,lcoor); - pokeLocalSite(s,higherDim,hcoor); + peekLocalSite(s,lowDimv,lcoor); + pokeLocalSite(s,higherDimv,hcoor); }); } @@ -711,6 +663,8 @@ void 
ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic } } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuWrite); + autoView(higherDimv,higherDim,CpuRead); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -723,8 +677,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic hcoor[d]=lcoor[ddl++]; } } - peekLocalSite(s,higherDim,hcoor); - pokeLocalSite(s,lowDim,lcoor); + peekLocalSite(s,higherDimv,hcoor); + pokeLocalSite(s,lowDimv,lcoor); }); } @@ -752,6 +706,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuRead); + autoView(higherDimv,higherDim,CpuWrite); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -760,8 +716,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; hcoor[orthog] = slice_hi; - peekLocalSite(s,lowDim,lcoor); - pokeLocalSite(s,higherDim,hcoor); + peekLocalSite(s,lowDimv,lcoor); + pokeLocalSite(s,higherDimv,hcoor); } }); } @@ -789,6 +745,8 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuWrite); + autoView(higherDimv,higherDim,CpuRead); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -797,8 +755,8 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; hcoor[orthog] = slice_hi; - peekLocalSite(s,higherDim,hcoor); - pokeLocalSite(s,lowDim,lcoor); + peekLocalSite(s,higherDimv,hcoor); + pokeLocalSite(s,lowDimv,lcoor); } }); } @@ -862,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) } //loop over outer index - auto in_v = in.View(); + autoView( in_v , in, CpuRead); thread_for(in_oidx,in_grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray out_ptrs(in_nsimd); @@ -955,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) icoor[lane].resize(ndim); grid->iCoorFromIindex(icoor[lane],lane); } - auto out_v = out.View(); + autoView( out_v , out, CpuWrite); thread_for(oidx, grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray ptrs(nsimd); @@ -1058,7 +1016,7 @@ void precisionChange(Lattice &out, const Lattice &in) std::vector in_slex_conv(in_grid->lSites()); unvectorizeToLexOrdArray(in_slex_conv, in); - auto out_v = out.View(); + autoView( out_v , out, CpuWrite); thread_for(out_oidx,out_grid->oSites(),{ Coordinate out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); diff --git a/Grid/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h index 6fe08c10..adfe3380 100644 --- a/Grid/lattice/Lattice_transpose.h +++ b/Grid/lattice/Lattice_transpose.h @@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid); template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); }); @@ -58,8 +58,8 @@ template inline auto TransposeIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); 
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss] , transposeIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_unary.h b/Grid/lattice/Lattice_unary.h index 591afe72..07424b3d 100644 --- a/Grid/lattice/Lattice_unary.h +++ b/Grid/lattice/Lattice_unary.h @@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid); template Lattice pow(const Lattice &rhs_i,RealD y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + autoView( rhs, rhs_i, AcceleratorRead); + autoView( ret, ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),1,{ ret[ss]=pow(rhs[ss],y); @@ -45,8 +45,8 @@ template Lattice pow(const Lattice &rhs_i,RealD y){ } template Lattice mod(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + autoView( rhs , rhs_i, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],mod(rhs(ss),y)); @@ -56,8 +56,8 @@ template Lattice mod(const Lattice &rhs_i,Integer y){ template Lattice div(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto ret = ret_i.View(); - auto rhs = rhs_i.View(); + autoView( ret , ret_i, AcceleratorWrite); + autoView( rhs , rhs_i, AcceleratorRead); ret.Checkerboard() = rhs_i.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],div(rhs(ss),y)); @@ -67,8 +67,8 @@ template Lattice div(const Lattice &rhs_i,Integer y){ template Lattice expMat(const Lattice &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + autoView( rhs , rhs_i, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h new file mode 100644 index 00000000..3b76b921 --- /dev/null +++ b/Grid/lattice/Lattice_view.h @@ -0,0 +1,168 @@ +#pragma once +NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////////////////// +// Base class which can be used by traits to pick up behaviour +/////////////////////////////////////////////////////////////////// +class LatticeBase {}; + +///////////////////////////////////////////////////////////////////////////////////////// +// Conformable checks; same instance of Grid required +///////////////////////////////////////////////////////////////////////////////////////// +void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) +{ + assert(lhs == rhs); +} + +//////////////////////////////////////////////////////////////////////////// +// Minimal base class containing only data valid to access from accelerator +// _odata will be a managed pointer in CUDA +//////////////////////////////////////////////////////////////////////////// +// Force access to lattice through a view object. +// prevents writing of code that will not offload to GPU, but perhaps annoyingly +// strict since host could could in principle direct access through the lattice object +// Need to decide programming model. 
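The discipline this header enforces is: never touch _odata directly, open a view with the access mode you need, and let the scoped closer release it when the block ends. A hedged sketch of the intended idiom (illustrative function, not part of the patch), mirroring the pattern applied throughout the hunks above:

// A user-level axpy written against the new accessor discipline.
// 'sobj' is a site scalar (e.g. ComplexD); the function name is illustrative.
template<class sobj, class vobj>
void myAxpy(Lattice<vobj> &z, sobj a, const Lattice<vobj> &x, const Lattice<vobj> &y)
{
  conformable(x, y);
  conformable(x, z);
  autoView( x_v, x, AcceleratorRead );   // autoView = View(mode) plus a scoped ViewCloser
  autoView( y_v, y, AcceleratorRead );
  autoView( z_v, z, AcceleratorWrite);
  accelerator_for(ss, x_v.size(), vobj::Nsimd(), {
    coalescedWrite(z_v[ss], a * x_v(ss) + y_v(ss));  // operator() reads one SIMT lane on device
  });
}                                                    // views closed automatically here

The extra { } blocks added around view use in earlier hunks serve the same purpose: the views are closed before the lattice is handed on to another routine (a global sum, for instance), because the ViewCloser fires at the end of the enclosing scope.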
+#define LATTICE_VIEW_STRICT +template class LatticeAccelerator : public LatticeBase +{ +protected: + //public: + GridBase *_grid; + int checkerboard; + vobj *_odata; // A managed pointer + uint64_t _odata_size; + ViewAdvise advise; +public: + accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { }; + accelerator_inline uint64_t oSites(void) const { return _odata_size; }; + accelerator_inline int Checkerboard(void) const { return checkerboard; }; + accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view + accelerator_inline ViewAdvise Advise(void) const { return advise; }; + accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view + accelerator_inline void Conformable(GridBase * &grid) const + { + if (grid) conformable(grid, _grid); + else grid = _grid; + }; + // Host only + GridBase * getGrid(void) const { return _grid; }; +}; + +///////////////////////////////////////////////////////////////////////////////////////// +// A View class which provides accessor to the data. +// This will be safe to call from accelerator_for and is trivially copy constructible +// The copy constructor for this will need to be used by device lambda functions +///////////////////////////////////////////////////////////////////////////////////////// +template +class LatticeView : public LatticeAccelerator +{ +public: + // Rvalue + ViewMode mode; + void * cpu_ptr; +#ifdef GRID_SIMT + accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { + return coalescedRead(this->_odata[i]); + } +#else + accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } +#endif + + accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; + accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; + + accelerator_inline uint64_t begin(void) const { return 0;}; + accelerator_inline uint64_t end(void) const { return this->_odata_size; }; + accelerator_inline uint64_t size(void) const { return this->_odata_size; }; + + LatticeView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me){} + LatticeView(const LatticeView &refer_to_me) = default; // Trivially copyable + LatticeView(const LatticeAccelerator &refer_to_me,ViewMode mode) : LatticeAccelerator (refer_to_me) + { + this->ViewOpen(mode); + } + + // Host functions + void ViewOpen(ViewMode mode) + { // Translate the pointer, could save a copy. 
Could use a "Handle" and not save _odata originally in base + // std::cout << "View Open"<_odata<cpu_ptr = (void *)this->_odata; + this->mode = mode; + this->_odata =(vobj *) + MemoryManager::ViewOpen(this->cpu_ptr, + this->_odata_size*sizeof(vobj), + mode, + this->advise); + } + void ViewClose(void) + { // Inform the manager + // std::cout << "View Close"<cpu_ptr<cpu_ptr,this->mode); + } + +}; +// Little autoscope assister +template +class ViewCloser +{ + View v; // Take a copy of view and call view close when I go out of scope automatically + public: + ViewCloser(View &_v) : v(_v) {}; + ~ViewCloser() { v.ViewClose(); } +}; + +#define autoView(l_v,l,mode) \ + auto l_v = l.View(mode); \ + ViewCloser _autoView##l_v(l_v); + +///////////////////////////////////////////////////////////////////////////////////////// +// Lattice expression types used by ET to assemble the AST +// +// Need to be able to detect code paths according to the whether a lattice object or not +// so introduce some trait type things +///////////////////////////////////////////////////////////////////////////////////////// + +class LatticeExpressionBase {}; + +template using is_lattice = std::is_base_of; +template using is_lattice_expr = std::is_base_of; + +template struct ViewMapBase { typedef T Type; }; +template struct ViewMapBase { typedef LatticeView Type; }; +template using ViewMap = ViewMapBase::value >; + +template +class LatticeUnaryExpression : public LatticeExpressionBase +{ +public: + typedef typename ViewMap<_T1>::Type T1; + Op op; + T1 arg1; + LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {}; +}; + +template +class LatticeBinaryExpression : public LatticeExpressionBase +{ +public: + typedef typename ViewMap<_T1>::Type T1; + typedef typename ViewMap<_T2>::Type T2; + Op op; + T1 arg1; + T2 arg2; + LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {}; +}; + +template +class LatticeTrinaryExpression : public LatticeExpressionBase +{ +public: + typedef typename ViewMap<_T1>::Type T1; + typedef typename ViewMap<_T2>::Type T2; + typedef typename ViewMap<_T3>::Type T3; + Op op; + T1 arg1; + T2 arg2; + T3 arg3; + LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {}; +}; +NAMESPACE_END(Grid); diff --git a/Grid/perfmon/PerfCount.h b/Grid/perfmon/PerfCount.h index 1e2a9528..dd25b41e 100644 --- a/Grid/perfmon/PerfCount.h +++ b/Grid/perfmon/PerfCount.h @@ -44,7 +44,7 @@ Author: paboyle #include #endif #ifdef __x86_64__ -#ifdef GRID_NVCC +#ifdef GRID_CUDA accelerator_inline uint64_t __rdtsc(void) { return 0; } accelerator_inline uint64_t __rdpmc(int ) { return 0; } #else @@ -112,7 +112,6 @@ class PerformanceCounter { private: typedef struct { - public: uint32_t type; uint64_t config; const char *name; diff --git a/Grid/pugixml/pugixml.cc b/Grid/pugixml/pugixml.cc index e7b395ad..45e6496a 100644 --- a/Grid/pugixml/pugixml.cc +++ b/Grid/pugixml/pugixml.cc @@ -12773,7 +12773,7 @@ namespace pugi #undef PUGI__THROW_ERROR #undef PUGI__CHECK_ERROR -#ifdef GRID_NVCC +#ifdef GRID_CUDA #pragma pop #endif diff --git a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h index 890c680b..0c8a0930 100644 --- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h +++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h @@ -114,19 +114,22 @@ public: U = adj(Cshift(U, mu, -1)); PokeIndex(Uadj, U, mu); } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); 
lidx++) { + + autoView(Umu_v,Umu,CpuRead); + autoView(Uadj_v,Uadj,CpuRead); + autoView(Uds_v,Uds,CpuWrite); + thread_for( lidx, GaugeGrid->lSites(), { Coordinate lcoor; GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUmu, Umu, lcoor); + peekLocalSite(ScalarUmu, Umu_v, lcoor); for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - peekLocalSite(ScalarUmu, Uadj, lcoor); + peekLocalSite(ScalarUmu, Uadj_v, lcoor); for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - pokeLocalSite(ScalarUds, Uds, lcoor); - } + pokeLocalSite(ScalarUds, Uds_v, lcoor); + }); } inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index fb6f18bb..16252340 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover); #include // 5d base used by all 5d overlap types NAMESPACE_CHECK(Wilson5D); +#include #include #include NAMESPACE_CHECK(Staggered); @@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion ImprovedStaggeredFermionD; +typedef NaiveStaggeredFermion NaiveStaggeredFermionR; +typedef NaiveStaggeredFermion NaiveStaggeredFermionF; +typedef NaiveStaggeredFermion NaiveStaggeredFermionD; + typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; -#ifndef GRID_NVCC +#ifndef GRID_CUDA typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dD; diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 0b147b3f..0b726db9 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -96,11 +96,11 @@ public: int sl = St._simd_layout[direction]; Coordinate icoor; -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT _Spinor tmp; const int Nsimd =SiteDoubledGaugeField::Nsimd(); - int s = SIMTlane(Nsimd); + int s = acceleratorSIMTlane(Nsimd); St.iCoorFromIindex(icoor,s); int mmu = mu % Nd; @@ -232,15 +232,17 @@ public: if ( Params.twists[mu] ) { Uconj = where(coor==neglink,-Uconj,Uconj); } - - auto U_v = U.View(); - auto Uds_v = Uds.View(); - auto Uconj_v = Uconj.View(); - auto Utmp_v= Utmp.View(); - thread_foreach(ss,U_v,{ - Uds_v[ss](0)(mu) = U_v[ss](); - Uds_v[ss](1)(mu) = Uconj_v[ss](); - }); + + { + autoView( U_v , U, CpuRead); + autoView( Uconj_v , Uconj, CpuRead); + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,U_v,{ + Uds_v[ss](0)(mu) = U_v[ss](); + Uds_v[ss](1)(mu) = Uconj_v[ss](); + }); + } U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary Uconj = adj(Cshift(Uconj,mu,-1)); @@ -250,19 +252,25 @@ public: Utmp = where(coor==0,Uconj,Utmp); } - thread_foreach(ss,Utmp_v,{ - Uds_v[ss](0)(mu+4) = Utmp_v[ss](); - }); - + { + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,Utmp_v,{ + Uds_v[ss](0)(mu+4) = Utmp_v[ss](); + }); + } Utmp = Uconj; if ( Params.twists[mu] ) { Utmp = where(coor==0,U,Utmp); } - - thread_foreach(ss,Utmp_v,{ - Uds_v[ss](1)(mu+4) = Utmp_v[ss](); - }); - + + { + autoView( Uds_v , Uds, 
CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,Utmp_v,{ + Uds_v[ss](1)(mu+4) = Utmp_v[ss](); + }); + } } } @@ -272,11 +280,14 @@ public: GaugeLinkField link(mat.Grid()); // use lorentz for flavour as hack. auto tmp = TraceIndex(outerProduct(Btilde, A)); - auto link_v = link.View(); - auto tmp_v = tmp.View(); - thread_foreach(ss,tmp_v,{ - link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); - }); + + { + autoView( link_v , link, CpuWrite); + autoView( tmp_v , tmp, CpuRead); + thread_foreach(ss,tmp_v,{ + link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); + }); + } PokeIndex(mat, link, mu); return; } @@ -306,16 +317,18 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(); - auto Atilde_v = Atilde.View(); - auto Btilde_v = Btilde.View(); - thread_for(ss,tmp.Grid()->oSites(),{ - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); - tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - }); + { + autoView( tmp_v , tmp, CpuWrite); + autoView( Atilde_v , Atilde, CpuRead); + autoView( Btilde_v , Btilde, CpuRead); + thread_for(ss,tmp.Grid()->oSites(),{ + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); + tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + }); + } PokeIndex(mat, tmp, mu); return; } diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index d1bb0e9c..625eda63 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -61,8 +61,8 @@ public: double DhopCalls; double DhopCommTime; double DhopComputeTime; - double DhopComputeTime2; - double DhopFaceTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h new file mode 100644 index 00000000..ca38a64f --- /dev/null +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -0,0 +1,194 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_QCD_NAIVE_STAG_FERMION_H +#define GRID_QCD_NAIVE_STAG_FERMION_H + +NAMESPACE_BEGIN(Grid); + +class NaiveStaggeredFermionStatic { +public: + static const std::vector directions; + static const std::vector displacements; + static const int npoint = 8; +}; + +template +class NaiveStaggeredFermion : public StaggeredKernels, public NaiveStaggeredFermionStatic { +public: + INHERIT_IMPL_TYPES(Impl); + typedef StaggeredKernels Kernels; + + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + + /////////////////////////////////////////////////////////////// + // Implement the abstract base + /////////////////////////////////////////////////////////////// + GridBase *GaugeGrid(void) { return _grid; } + GridBase *GaugeRedBlackGrid(void) { return _cbgrid; } + GridBase *FermionGrid(void) { return _grid; } + GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + + ////////////////////////////////////////////////////////////////// + // override multiply; cut number routines if pass dagger argument + // and also make interface more uniformly consistent + ////////////////////////////////////////////////////////////////// + void M(const FermionField &in, FermionField &out); + void Mdag(const FermionField &in, FermionField &out); + + ///////////////////////////////////////////////////////// + // half checkerboard operations + ///////////////////////////////////////////////////////// + void Meooe(const FermionField &in, FermionField &out); + void MeooeDag(const FermionField &in, FermionField &out); + void Mooee(const FermionField &in, FermionField &out); + void MooeeDag(const FermionField &in, FermionField &out); + void MooeeInv(const FermionField &in, FermionField &out); + void MooeeInvDag(const FermionField &in, FermionField &out); + + //////////////////////// + // Derivative interface + //////////////////////// + // Interface calls an internal routine + void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); + + /////////////////////////////////////////////////////////////// + // non-hermitian hopping term; half cb or both + /////////////////////////////////////////////////////////////// + void Dhop (const FermionField &in, FermionField &out, int dag); + void DhopOE(const FermionField &in, FermionField &out, int dag); + void DhopEO(const FermionField &in, FermionField &out, int dag); + + /////////////////////////////////////////////////////////////// + // Multigrid assistance; force term uses too + /////////////////////////////////////////////////////////////// + void Mdir(const FermionField &in, FermionField &out, int dir, int disp); + void MdirAll(const FermionField &in, std::vector &out); + void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); + + /////////////////////////////////////////////////////////////// + // Extra methods added by derived + 
/////////////////////////////////////////////////////////////// + void DerivInternal(StencilImpl &st, + DoubledGaugeField &U, + GaugeField &mat, + const FermionField &A, const FermionField &B, int dag); + + void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + + ////////////////////////////////////////////////////////////////////////// + // Grid own interface Constructor + ////////////////////////////////////////////////////////////////////////// + NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p = ImplParams()); + NaiveStaggeredFermion(GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p = ImplParams()); + + // DoubleStore impl dependent + void ImportGauge (const GaugeField &_U ); + DoubledGaugeField &GetU(void) { return Umu ; } ; + void CopyGaugeCheckerboards(void); + + /////////////////////////////////////////////////////////////// + // Data members require to support the functionality + /////////////////////////////////////////////////////////////// + + // protected: +public: + // any other parameters of action ??? + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } + RealD mass; + RealD u0; + RealD c1; + + GridBase *_grid; + GridBase *_cbgrid; + + // Defines the stencils for even and odd + StencilImpl Stencil; + StencilImpl StencilEven; + StencilImpl StencilOdd; + + // Copy of the gauge field , with even and odd subsets + DoubledGaugeField Umu; + DoubledGaugeField UmuEven; + DoubledGaugeField UmuOdd; + + LebesgueOrder Lebesgue; + LebesgueOrder LebesgueEvenOdd; + + /////////////////////////////////////////////////////////////// + // Conserved current utilities + /////////////////////////////////////////////////////////////// + void ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu); + void SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &srct, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx); +}; + +typedef NaiveStaggeredFermion NaiveStaggeredFermionF; +typedef NaiveStaggeredFermion NaiveStaggeredFermionD; + +NAMESPACE_END(Grid); + +#endif diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index 6ef0ab9d..30deee06 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -47,23 +47,34 @@ template class StaggeredKernels : public FermionOperator , pub INHERIT_IMPL_TYPES(Impl); typedef FermionOperator Base; -public: - - void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, - int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); + public: + + void DhopImproved(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); + void 
DhopNaive(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); + + void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, + int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); + protected: /////////////////////////////////////////////////////////////////////////////////////// // Generic Nc kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); @@ -71,15 +82,18 @@ public: /////////////////////////////////////////////////////////////////////////////////////// // Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + template accelerator_inline + void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); @@ -87,27 +101,10 @@ public: /////////////////////////////////////////////////////////////////////////////////////// // Asm Nc=3 specific kernels /////////////////////////////////////////////////////////////////////////////////////// - void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, + void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, SiteSpinor * buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out,int dag); - /////////////////////////////////////////////////////////////////////////////////////////////////// - // Generic interface; fan out to right routine - /////////////////////////////////////////////////////////////////////////////////////////////////// - void DhopSite(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1); - - void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, 
int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1); - - void DhopSite(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor * buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior); public: diff --git a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h index 2d4de18e..18fe993c 100644 --- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h +++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h @@ -113,20 +113,7 @@ public: inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) { - GridBase *GaugeGrid = U_ds.Grid(); - thread_for(lidx, GaugeGrid->lSites(),{ - - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - Coordinate lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUds, U_ds, lcoor); - - peekLocalSite(ScalarU, U, lcoor); - ScalarUds(mu) = ScalarU(); - - }); + assert(0); } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 4b25d00e..91ad6d6d 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -257,15 +257,16 @@ private: CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO + public: // eventually these can be compressed into 6x6 blocks instead of the 12x12 // using the DeGrand-Rossi basis for the gamma matrices CloverFieldType fillCloverYZ(const GaugeLinkField &F) { CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); @@ -281,9 +282,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v, T,AcceleratorWrite); + autoView(F_v, F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()(); @@ -299,9 +300,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()()); @@ -317,9 +318,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v , T, AcceleratorWrite); + autoView( F_v , F, AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()()); @@ -335,9 +336,9 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v ,T,AcceleratorWrite); + autoView( F_v ,F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()()); @@ -354,9 
+355,9 @@ private: T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); - thread_for(i, CloverTerm.Grid()->oSites(), + autoView( T_v , T,AcceleratorWrite); + autoView( F_v , F,AcceleratorRead); + accelerator_for(i, CloverTerm.Grid()->oSites(),1, { T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 1c4dd3cf..bf8926d0 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -50,14 +50,14 @@ public: double, nu); WilsonAnisotropyCoefficients(): - isAnisotropic(false), - t_direction(Nd-1), - xi_0(1.0), + isAnisotropic(false), + t_direction(Nd-1), + xi_0(1.0), nu(1.0){} }; template -class WilsonFermion : public WilsonKernels, public WilsonFermionStatic +class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { public: INHERIT_IMPL_TYPES(Impl); @@ -74,6 +74,20 @@ public: FermionField _tmp; FermionField &tmp(void) { return _tmp; } + void Report(void); + void ZeroCounters(void); + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + double DhopTotalTime; + + double DerivCalls; + double DerivCommTime; + double DerivComputeTime; + double DerivDhopComputeTime; + ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent @@ -138,7 +152,7 @@ public: // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p = ImplParams(), + const ImplParams &p = ImplParams(), const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() ); // DoubleStore impl dependent @@ -170,9 +184,9 @@ public: LebesgueOrder Lebesgue; LebesgueOrder LebesgueEvenOdd; - + WilsonAnisotropyCoefficients anisotropyCoeff; - + /////////////////////////////////////////////////////////////// // Conserved current utilities /////////////////////////////////////////////////////////////// @@ -186,7 +200,7 @@ public: PropagatorField &q_out, PropagatorField &phys_src, Current curr_type, - unsigned int mu, + unsigned int mu, unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx); @@ -196,5 +210,3 @@ typedef WilsonFermion WilsonFermionF; typedef WilsonFermion WilsonFermionD; NAMESPACE_END(Grid); - - diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index e78023cf..52e1ee00 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,10 +106,10 @@ public: const _SpinorField & phi, int mu) { - auto out_v= out.View(); - auto phi_v= phi.View(); - auto Umu_v= Umu.View(); - thread_for(sss,out.Grid()->oSites(),{ + autoView( out_v, out, AcceleratorWrite); + autoView( phi_v, phi, AcceleratorRead); + autoView( Umu_v, Umu, AcceleratorRead); + accelerator_for(sss,out.Grid()->oSites(),1,{ multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); }); } @@ -191,18 +191,19 @@ public: int Ls=Btilde.Grid()->_fdimensions[0]; GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(); - auto Btilde_v = Btilde.View(); - auto Atilde_v = Atilde.View(); - thread_for(sss,tmp.Grid()->oSites(),{ - int sU=sss; - for(int s=0;s(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here - } - }); + { + autoView( tmp_v , tmp, AcceleratorWrite); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, 
AcceleratorRead); + accelerator_for(sss,tmp.Grid()->oSites(),1,{ + int sU=sss; + for(int s=0;s(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here + } + }); + } PokeIndex(mat,tmp,mu); - } }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index e9675b36..e79b64dc 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -180,7 +180,7 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; -#ifdef GRID_NVCC +#ifdef GRID_CUDA RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#ifndef GRID_NVCC +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#ifndef GRID_NVCC +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index dbdf134b..d2537ccf 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -50,9 +50,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -93,9 +93,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -131,8 +131,8 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); int Ls=this->Ls; @@ -193,8 +193,8 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi GridBase *grid=psi_i.Grid(); int Ls=this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); auto plee 
= & lee [0]; auto pdee = & dee [0]; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h index 034ce642..b54f63ad 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h @@ -65,9 +65,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi, psi_i,CpuRead); + autoView(phi, phi_i,CpuRead); + autoView(chi, chi_i,CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd= Simd::Nsimd(); @@ -213,9 +213,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi=psi_i.View(); - auto phi=phi_i.View(); - auto chi=chi_i.View(); + autoView(psi,psi_i,CpuRead); + autoView(phi,phi_i,CpuRead); + autoView(chi,chi_i,CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd= Simd::Nsimd(); @@ -357,8 +357,8 @@ CayleyFermion5D::MooeeInternalAsm(const FermionField &psi_i, FermionField Vector > &Matm) { EnableIf sfinae=0; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); #ifndef AVX512 { SiteHalfSpinor BcastP; @@ -535,8 +535,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField EnableIf sfinae=0; #ifndef AVX512 { - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; @@ -586,8 +586,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField } #else { - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); // pointers // MASK_REGS; #define Chi_00 %zmm0 diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 46d3fa1f..9a8454ef 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -46,9 +46,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; GridBase* grid = psi_i.Grid(); - auto phi = phi_i.View(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView( phi , phi_i, AcceleratorRead); + autoView( psi , psi_i, AcceleratorRead); + autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -82,9 +82,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio GridBase* grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView( psi , psi_i, AcceleratorRead); + autoView( phi , phi_i, AcceleratorRead); + autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -116,8 +116,8 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi=psi_i.View(); - auto chi=chi_i.View(); + autoView( psi, psi_i, 
AcceleratorRead); + autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; @@ -172,8 +172,8 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView( psi, psi_i, AcceleratorRead); + autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 44a201c1..888691c4 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionFi Compressor compressor; Stencil.HaloExchange(in,compressor); - auto Umu_v = Umu.View(); - auto UUUmu_v = UUUmu.View(); - auto in_v = in.View(); - auto out_v = out.View(); + autoView( Umu_v , Umu, CpuRead); + autoView( UUUmu_v , UUUmu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); thread_for( ss,Umu.Grid()->oSites(),{ for(int s=0;s::DhopInternal(StencilImpl & st, LebesgueOr DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { -#ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else -#endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); } @@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { -#ifdef GRID_OMP // assert((dag==DaggerNo) ||(dag==DaggerYes)); - Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; @@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DhopFaceTime-=usecond(); st.Prepare(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - double ctime=0; - double ptime=0; - ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ugly explicit thread mapping introduced for OPA reasons. + // Remove explicit thread mapping introduced for OPA reasons. 
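The change repeated throughout these hunks swaps the old unqualified Field.View() accessor for the autoView(name, field, mode) form, which opens a scoped view together with an explicit access mode (AcceleratorRead/AcceleratorWrite for device kernels, CpuRead/CpuWrite for host-side thread_for loops), presumably so the field can be made resident on the correct side before the kernel touches it. A minimal sketch of the two shapes, reusing the psi_i/chi_i names from the M5D and MooeeInv hunks above; the enclosing braces are illustrative only and Grid's headers and field types are assumed to be in scope.

    // Old form: no statement of how the data is accessed.
    //   auto psi = psi_i.View();
    //   auto chi = chi_i.View();
    //
    // New form used in this patch: a scoped view that records the access mode.
    {
      autoView( psi , psi_i, AcceleratorRead );   // read-only inside accelerator_for
      autoView( chi , chi_i, AcceleratorWrite );  // written inside accelerator_for
      // ... the kernel body indexes psi[ss] / chi[ss] exactly as before ...
    }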
////////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + DhopComputeTime-=usecond(); { - int tid = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = 1; - assert(nthreads > ncomms); - if (tid >= ncomms) { - double start = usecond(); - nthreads -= ncomms; - int ttid = tid - ncomms; - int n = U.Grid()->oSites(); // 4d vol - int chunk = n / nthreads; - int rem = n % nthreads; - int myblock, myn; - if (ttid < rem) { - myblock = ttid * chunk + ttid; - myn = chunk+1; - } else { - myblock = ttid*chunk + rem; - myn = chunk; - } - - // do the compute - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - - if (dag == DaggerYes) { - for (int ss = myblock; ss < myblock+myn; ++ss) { - int sU = ss; - // Interior = 1; Exterior = 0; must implement for staggered - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<--------- - } - } else { - for (int ss = myblock; ss < myblock+myn; ++ss) { - // Interior = 1; Exterior = 0; - int sU = ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------ - } - } - ptime = usecond() - start; - } else { - double start = usecond(); - st.CommunicateThreaded(); - ctime = usecond() - start; - } + int interior=1; + int exterior=0; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } - DhopCommTime += ctime; - DhopComputeTime+=ptime; - - // First to enter, last to leave timing - st.CollateThreads(); + DhopComputeTime+=usecond(); DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); - DhopComputeTime2-=usecond(); + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - int sz=st.surface_list.size(); - thread_for( ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<---------- - }); - } else { - int sz=st.surface_list.size(); - thread_for( ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<---------- - }); + DhopComputeTime2-=usecond(); + { + int interior=0; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime2+=usecond(); -#else - assert(0); -#endif - } template @@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, Compressor compressor; int LLs = in.Grid()->_rdimensions[0]; - - //double t1=usecond(); DhopTotalTime -= usecond(); DhopCommTime -= usecond(); @@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - thread_for( ss,U.Grid()->oSites(),{ - int sU=ss; - Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v); - }); - } else { - thread_for( ss,U.Grid()->oSites(),{ - int sU=ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v); - }); + { + int interior=1; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); 
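The rewritten DhopInternalOverlappedComms above replaces the removed hand-rolled OpenMP scheme (dedicated communication threads plus an explicit block decomposition of the compute loop) with a begin / interior / complete / exterior sequence built on the stencil's asynchronous calls. A condensed sketch of that sequence with the timers stripped out; the stencil and kernel names are taken from the hunks, while the element type of requests (CommsRequest_t) is not visible in the collapsed hunk and is an assumption here. The 4d and 5d variants in this patch order CommsMerge and CommunicateComplete slightly differently; the sketch shows one plausible ordering.

    std::vector<std::vector<CommsRequest_t> > requests;   // CommsRequest_t assumed
    st.Prepare();
    st.HaloGather(in, compressor);        // pack halo faces
    st.CommunicateBegin(requests);        // start the asynchronous exchange
    st.CommsMergeSHM(compressor);         // merge intra-node (shared-memory) faces

    Kernels::DhopImproved(st, lo, U, UUU, in, out, dag,
                          /*interior*/ 1, /*exterior*/ 0);   // overlaps with comms

    st.CommunicateComplete(requests);     // wait for the off-node faces
    st.CommsMerge(compressor);            // fold received faces into the comm buffer

    Kernels::DhopImproved(st, lo, U, UUU, in, out, dag,
                          /*interior*/ 0, /*exterior*/ 1);   // finish the surface sites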
DhopTotalTime += usecond(); - //double t2=usecond(); - //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; - //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; } /*CHANGE END*/ diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 57f4cb89..05d9a17e 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -258,10 +258,10 @@ void ImprovedStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGauge //////////////////////// // Call the single hop //////////////////////// - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto B_v = B.View(); - auto Btilde_v = Btilde.View(); + autoView( U_v , U, CpuRead); + autoView( UUU_v , UUU, CpuRead); + autoView( B_v , B, CpuWrite); + autoView( Btilde_v , Btilde, CpuWrite); thread_for(sss,B.Grid()->oSites(),{ Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); }); @@ -386,10 +386,10 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel Compressor compressor; Stencil.HaloExchange(in, compressor); - auto Umu_v = Umu.View(); - auto UUUmu_v = UUUmu.View(); - auto in_v = in.View(); - auto out_v = out.View(); + autoView( Umu_v , Umu, CpuRead); + autoView( UUUmu_v , UUUmu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); thread_for( sss, in.Grid()->oSites(),{ Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); }); @@ -403,11 +403,9 @@ void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder const FermionField &in, FermionField &out, int dag) { -#ifdef GRID_OMP if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); else -#endif DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); } template @@ -417,7 +415,6 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st const FermionField &in, FermionField &out, int dag) { -#ifdef GRID_OMP Compressor compressor; int len = U.Grid()->oSites(); @@ -426,60 +423,30 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopFaceTime -= usecond(); st.Prepare(); st.HaloGather(in,compressor); - st.CommsMergeSHM(compressor); DhopFaceTime += usecond(); + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+= usecond(); + ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ugly explicit thread mapping introduced for OPA reasons. 
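The DhopCommTime / DhopFaceTime / DhopComputeTime counters bracketing each phase above use an accumulate-in-place idiom rather than separate start/stop variables: the counter is decremented by the wall clock on entry and incremented on exit, so it sums the elapsed time over all calls. A small sketch of the idiom, assuming only Grid's usecond() microsecond wall clock:

    double DhopCommTime = 0.0;          // accumulated microseconds over all calls
    // per call:
    DhopCommTime -= usecond();          // subtract the clock on entry
    st.HaloExchange(in, compressor);    // the timed work
    DhopCommTime += usecond();          // add it back on exit; the difference accumulates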
+ // Removed explicit thread comms ////////////////////////////////////////////////////////////////////////////////////////////////////// DhopComputeTime -= usecond(); -#pragma omp parallel { - int tid = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - int ncomms = CartesianCommunicator::nCommThreads; - if (ncomms == -1) ncomms = 1; - assert(nthreads > ncomms); - - if (tid >= ncomms) { - nthreads -= ncomms; - int ttid = tid - ncomms; - int n = len; - int chunk = n / nthreads; - int rem = n % nthreads; - int myblock, myn; - if (ttid < rem) { - myblock = ttid * chunk + ttid; - myn = chunk+1; - } else { - myblock = ttid*chunk + rem; - myn = chunk; - } - - // do the compute - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - for (int ss = myblock; ss < myblock+myn; ++ss) { - int sU = ss; - // Interior = 1; Exterior = 0; must implement for staggered - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); - } - } else { - for (int ss = myblock; ss < myblock+myn; ++ss) { - // Interior = 1; Exterior = 0; - int sU = ss; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0); - } - } - } else { - st.CommunicateThreaded(); - } + int interior=1; + int exterior=0; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + // First to enter, last to leave timing DhopFaceTime -= usecond(); st.CommsMerge(compressor); @@ -487,28 +454,11 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopComputeTime2 -= usecond(); { - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); - if (dag == DaggerYes) { - int sz=st.surface_list.size(); - thread_for(ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1); - }); - } else { - int sz=st.surface_list.size(); - thread_for(ss,sz,{ - int sU = st.surface_list[ss]; - Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1); - }); - } + int interior=0; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime2 += usecond(); -#else - assert(0); -#endif } @@ -528,19 +478,11 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le st.HaloExchange(in, compressor); DhopCommTime += usecond(); - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); DhopComputeTime -= usecond(); - if (dag == DaggerYes) { - thread_for(sss, in.Grid()->oSites(),{ - Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); - }); - } else { - thread_for(sss, in.Grid()->oSites(),{ - Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); - }); + { + int interior=1; + int exterior=1; + Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index f74c7a51..41b9170d 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -44,9 +44,9 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = 
this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -84,9 +84,9 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator @@ -132,9 +132,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -174,9 +174,9 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm GridBase *grid = psi_i.Grid(); int Ls = this->Ls; int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -226,8 +226,8 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -286,8 +286,8 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto pm = this->pm; auto plee = & this->lee [0]; @@ -354,8 +354,8 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -410,8 +410,8 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); int Ls = this->Ls; auto pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h new file mode 100644 index 00000000..788e02cf --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -0,0 +1,499 @@ 
+/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +#pragma once + +NAMESPACE_BEGIN(Grid); + +///////////////////////////////// +// Constructor and gauge import +///////////////////////////////// + +template +NaiveStaggeredFermion::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, + RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p) + : Kernels(p), + _grid(&Fgrid), + _cbgrid(&Hgrid), + Stencil(&Fgrid, npoint, Even, directions, displacements,p), + StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even + StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd + mass(_mass), + Lebesgue(_grid), + LebesgueEvenOdd(_cbgrid), + Umu(&Fgrid), + UmuEven(&Hgrid), + UmuOdd(&Hgrid), + _tmp(&Hgrid) +{ + int vol4; + int LLs=1; + c1=_c1; + u0=_u0; + vol4= _grid->oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4= _cbgrid->oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); +} + +template +NaiveStaggeredFermion::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid, + GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _u0, + const ImplParams &p) + : NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p) +{ + ImportGauge(_U); +} + +//////////////////////////////////////////////////////////// +// Momentum space propagator should be +// https://arxiv.org/pdf/hep-lat/9712010.pdf +// +// mom space action. +// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m +// +// must track through staggered flavour/spin reduction in literature to +// turn to free propagator for the one component chi field, a la page 4/5 +// of above link to implmement fourier based solver. +//////////////////////////////////////////////////////////// + +template +void NaiveStaggeredFermion::CopyGaugeCheckerboards(void) +{ + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); +} +template +void NaiveStaggeredFermion::ImportGauge(const GaugeField &_U) +{ + GaugeLinkField U(GaugeGrid()); + DoubledGaugeField _UUU(GaugeGrid()); + //////////////////////////////////////////////////////// + // Double Store should take two fields for Naik and one hop separately. 
+ // Discard teh Naik as Naive + //////////////////////////////////////////////////////// + Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U ); + + //////////////////////////////////////////////////////// + // Apply scale factors to get the right fermion Kinetic term + // Could pass coeffs into the double store to save work. + // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) ) + //////////////////////////////////////////////////////// + for (int mu = 0; mu < Nd; mu++) { + + U = PeekIndex(Umu, mu); + PokeIndex(Umu, U*( 0.5*c1/u0), mu ); + + U = PeekIndex(Umu, mu+4); + PokeIndex(Umu, U*(-0.5*c1/u0), mu+4); + + } + + CopyGaugeCheckerboards(); +} + +///////////////////////////// +// Implement the interface +///////////////////////////// + +template +void NaiveStaggeredFermion::M(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Dhop(in, out, DaggerNo); + axpy(out, mass, in, out); +} + +template +void NaiveStaggeredFermion::Mdag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Dhop(in, out, DaggerYes); + axpy(out, mass, in, out); +} + +template +void NaiveStaggeredFermion::Meooe(const FermionField &in, FermionField &out) { + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } +} +template +void NaiveStaggeredFermion::MeooeDag(const FermionField &in, FermionField &out) { + if (in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } +} + +template +void NaiveStaggeredFermion::Mooee(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + typename FermionField::scalar_type scal(mass); + out = scal * in; +} + +template +void NaiveStaggeredFermion::MooeeDag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + Mooee(in, out); +} + +template +void NaiveStaggeredFermion::MooeeInv(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + out = (1.0 / (mass)) * in; +} + +template +void NaiveStaggeredFermion::MooeeInvDag(const FermionField &in, FermionField &out) +{ + out.Checkerboard() = in.Checkerboard(); + MooeeInv(in, out); +} + +/////////////////////////////////// +// Internal +/////////////////////////////////// + +template +void NaiveStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, + GaugeField & mat, + const FermionField &A, const FermionField &B, int dag) +{ + assert((dag == DaggerNo) || (dag == DaggerYes)); + + Compressor compressor; + + FermionField Btilde(B.Grid()); + FermionField Atilde(B.Grid()); + Atilde = A; + + st.HaloExchange(B, compressor); + + for (int mu = 0; mu < Nd; mu++) { + + //////////////////////// + // Call the single hop + //////////////////////// + autoView( U_v , U, CpuRead); + autoView( B_v , B, CpuWrite); + autoView( Btilde_v , Btilde, CpuWrite); + thread_for(sss,B.Grid()->oSites(),{ + Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); + }); + + assert(0);// need to figure out the force interface with a blasted three link term. 
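The comments in this new NaiveStaggeredFermion implementation describe the operator in two complementary ways: the free momentum-space action quoted from the hep-lat/9712010 reference, and the position-space one-hop term whose coefficient ImportGauge folds into the doubled links. Written out as a sketch in the notation of those comments, with the Naik coefficient c_2 dropped because the naive operator discards the three-hop term, and with the staggered phases assumed to be carried inside the doubled links by DoubleStore:

    \[ S(p) \;\sim\; i\,\gamma_\mu\left(c_1 \sin p_\mu + c_2 \sin 3p_\mu\right) + m,
       \qquad c_2 \to 0 \ \text{(naive operator)} \]

    \[ D\,\psi(x) \;=\; \frac{c_1}{2\,u_0}\sum_\mu\Big[\, U_\mu(x)\,\psi(x+\hat\mu)
       \;-\; U_\mu^\dagger(x-\hat\mu)\,\psi(x-\hat\mu) \,\Big], \]

which is why the forward links are scaled by +0.5*c1/u0 and the backward (mu+4) links by -0.5*c1/u0 in the loop above.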
+ + } +} + +template +void NaiveStaggeredFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _grid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + mat.Checkerboard() = U.Checkerboard(); + + DerivInternal(Stencil, Umu, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _cbgrid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + assert(V.Checkerboard() == Even); + assert(U.Checkerboard() == Odd); + mat.Checkerboard() = Odd; + + DerivInternal(StencilEven, UmuOdd, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { + + conformable(U.Grid(), _cbgrid); + conformable(U.Grid(), V.Grid()); + conformable(U.Grid(), mat.Grid()); + + assert(V.Checkerboard() == Odd); + assert(U.Checkerboard() == Even); + mat.Checkerboard() = Even; + + DerivInternal(StencilOdd, UmuEven, mat, U, V, dag); +} + +template +void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=2; + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); +} + +template +void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=1; + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); +} + +template +void NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &out, int dag) +{ + DhopCalls+=1; + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); +} + +template +void NaiveStaggeredFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +{ + DhopDir(in, out, dir, disp); +} +template +void NaiveStaggeredFermion::MdirAll(const FermionField &in, std::vector &out) +{ + assert(0); // Not implemented yet +} + +template +void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +{ + + Compressor compressor; + Stencil.HaloExchange(in, compressor); + autoView( Umu_v , Umu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); + // thread_for( sss, in.Grid()->oSites(),{ + // Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); + // }); + assert(0); +}; + + +template +void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else + DhopInternalSerialComms(st,lo,U,in,out,dag); +} +template +void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + Compressor compressor; + int len = U.Grid()->oSites(); + + DhopTotalTime -= 
usecond(); + + DhopFaceTime -= usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + DhopFaceTime += usecond(); + + DhopCommTime -=usecond(); + std::vector > requests; + st.CommunicateBegin(requests); + + DhopFaceTime-=usecond(); + st.CommsMergeSHM(compressor); + DhopFaceTime+= usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Removed explicit thread comms + ////////////////////////////////////////////////////////////////////////////////////////////////////// + DhopComputeTime -= usecond(); + { + int interior=1; + int exterior=0; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime += usecond(); + + st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + // First to enter, last to leave timing + DhopFaceTime -= usecond(); + st.CommsMerge(compressor); + DhopFaceTime -= usecond(); + + DhopComputeTime2 -= usecond(); + { + int interior=0; + int exterior=1; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime2 += usecond(); +} + +template +void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) +{ + assert((dag == DaggerNo) || (dag == DaggerYes)); + + DhopTotalTime -= usecond(); + + DhopCommTime -= usecond(); + Compressor compressor; + st.HaloExchange(in, compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + { + int interior=1; + int exterior=1; + Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); +}; + + //////////////////////////////////////////////////////////////// + // Reporting + //////////////////////////////////////////////////////////////// +template +void NaiveStaggeredFermion::Report(void) +{ + Coordinate latt = _grid->GlobalDimensions(); + RealD volume = 1; for(int mu=0;mu_Nprocessors; + RealD NN = _grid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" < +void 
NaiveStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + +//////////////////////////////////////////////////////// +// Conserved current - not yet implemented. +//////////////////////////////////////////////////////// +template +void NaiveStaggeredFermion::ContractConservedCurrent(PropagatorField &q_in_1, + PropagatorField &q_in_2, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu) +{ + assert(0); +} + +template +void NaiveStaggeredFermion::SeqConservedCurrent(PropagatorField &q_in, + PropagatorField &q_out, + PropagatorField &src, + Current curr_type, + unsigned int mu, + unsigned int tmin, + unsigned int tmax, + ComplexField &lattice_cmplx) +{ + assert(0); + +} + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h index 1a13e73a..63fd2a2f 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h @@ -618,10 +618,10 @@ Author: paboyle NAMESPACE_BEGIN(Grid); template -void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { assert(0); @@ -680,12 +680,13 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge3 =(uint64_t)&UU[sU]( T ); + // This is the single precision 5th direction vectorised kernel #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -702,9 +703,10 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl } #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dag) { #ifdef AVX512 @@ -756,8 +758,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl // This is the single precision 5th direction vectorised kernel #include -template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -841,9 +844,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s void StaggeredKernels::DhopSiteAsm(StencilImpl &st, } #include -template <> void 
StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, +template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { #ifdef AVX512 @@ -910,9 +913,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, StencilEntry *SE2; StencilEntry *SE3; - for(int s=0;s -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -181,8 +182,9 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG (U,Ym,2,skew,odd); HAND_STENCIL_LEG (U,Zm,1,skew,even); HAND_STENCIL_LEG (U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG(UUU,Xp,3,skew,even); HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); @@ -202,7 +205,7 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG(UUU,Zm,1,skew,even); HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); - + } if ( dag ) { result()()(0) = - even_0 - odd_0; result()()(1) = - even_1 - odd_1; @@ -218,9 +221,10 @@ void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template -void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -253,8 +257,9 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(U,Zm,1,skew,even); HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd); @@ -277,7 +283,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd); - + } // Assume every site must be connected to at least one interior point. No 1^4 subvols. 
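In the hand-unrolled kernels above the change of shape is twofold: the kernels no longer loop over the fifth dimension internally (they take a single site pair sF, sU, with the driver flattening ss over Nsite*Ls and sU = ss/Ls), and the three-hop legs are guarded by a new compile-time template parameter Naik, so one kernel body serves both the improved operator (one-hop plus Naik legs) and the naive operator (one-hop only). A schematic of that shape with the leg macros abbreviated to comments; only the Naik guard, the skew offsets and the sF/sU convention are taken from the hunks, and the wrapper name is made up for illustration.

    template <int Naik>                 // 1: improved (adds the 3-hop legs), 0: naive
    void dhopSiteSketch(int sF, int sU /*, st, U, UUU, buf, in, out, dag */)
    {
      int skew = 0;                     // one-hop legs read stencil entries [0,8)
      // HAND_STENCIL_LEG(U,  Xp, 3, skew, even);  ... eight one-hop legs ...
      if (Naik) {                       // constant-folded away when Naik == 0
        skew = 8;                       // three-hop legs live at stencil offset 8
        // HAND_STENCIL_LEG(UUU, Xp, 3, skew, even);  ... eight Naik legs ...
      }
      // combine the even/odd accumulators, negate for dag, write out[sF]
    }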
if ( dag ) { result()()(0) = - even_0 - odd_0; @@ -294,9 +300,10 @@ void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template -void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { typedef typename Simd::scalar_type S; @@ -329,8 +336,9 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, StencilEntry *SE; int skew; - for(int s=0;s::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd); + if (Naik) { skew = 8; HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd); @@ -353,7 +362,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd); - + } // Add sum of all exterior connected stencil legs if ( nmu ) { if ( dag ) { @@ -370,6 +379,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, } } +/* #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ template void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ @@ -385,7 +395,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ SiteSpinor *buf, int LLs, int sU, \ const FermionFieldView &in, FermionFieldView &out, int dag); \ - +*/ #undef LOAD_CHI NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index d301556c..141725a7 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid); if (SE->_is_local ) { \ if (SE->_permute) { \ chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ + permute(chi, in[SE->_offset], ptype); \ } else { \ - chi_p = &in[SE->_offset]; \ + chi_p = &in[SE->_offset]; \ } \ } else { \ chi_p = &buf[SE->_offset]; \ @@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid); if (SE->_is_local ) { \ if (SE->_permute) { \ chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ + permute(chi, in[SE->_offset], ptype); \ } else { \ - chi_p = &in[SE->_offset]; \ + chi_p = &in[SE->_offset]; \ } \ } else if ( st.same_node[Dir] ) { \ chi_p = &buf[SE->_offset]; \ } \ if (SE->_is_local || st.same_node[Dir] ) { \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], *chi_p, Dir); \ } #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ @@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid); if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ nmu++; \ chi_p = &buf[SE->_offset]; \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], *chi_p, Dir); \ } template @@ -78,10 +78,12 @@ StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; // Int, Ext, Int+Ext cases for comms overlap //////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, +template +void 
StaggeredKernels::DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, int dag) { + SiteSpinor *buf, int sF, int sU, + const FermionFieldView &in, FermionFieldView &out, int dag) +{ const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; @@ -89,8 +91,10 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, int ptype; int skew; - for(int s=0;s::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); @@ -109,6 +114,7 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); + } if ( dag ) { Uchi = - Uchi; } @@ -120,9 +126,10 @@ void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, // Only contributions from interior of our node /////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { const SiteSpinor *chi_p; SiteSpinor chi; @@ -131,8 +138,9 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & int ptype; int skew ; - for(int s=0;s::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); @@ -152,6 +161,7 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); + } if ( dag ) { Uchi = - Uchi; } @@ -164,9 +174,10 @@ void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder & // Only contributions from exterior of our node /////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, +template +void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, + SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out,int dag) { const SiteSpinor *chi_p; // SiteSpinor chi; @@ -176,8 +187,9 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & int nmu=0; int skew ; - for(int s=0;s::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); + if ( Naik ) { skew=8; GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); @@ -197,7 
+210,7 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); - + } if ( nmu ) { if ( dag ) { out[sF] = out[sF] - Uchi; @@ -211,72 +224,9 @@ void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder & //////////////////////////////////////////////////////////////////////////////////// // Driving / wrapping routine to select right kernel //////////////////////////////////////////////////////////////////////////////////// - template -void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, - int interior,int exterior) -{ - int dag=1; - DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); -}; - -template -void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionFieldView &in, FermionFieldView &out, - int interior,int exterior) -{ - int dag=0; - DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); -}; - -template -void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionFieldView &in, FermionFieldView &out, - int dag,int interior,int exterior) -{ - switch(Opt) { -#ifdef AVX512 - case OptInlineAsm: - if ( interior && exterior ) { - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag); - } else { - std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi assert(0); } +#define KERNEL_CALLNB(A,improved) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + ThisKernel:: template A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ + }); + +#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); + +#define ASM_CALL(A) \ + const uint64_t NN = Nsite*Ls; \ + thread_for( ss, NN, { \ + int sF = ss; \ + int sU = ss/Ls; \ + ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ + }); + +template +void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior) +{ + GridBase *FGrid=in.Grid(); + GridBase *UGrid=U.Grid(); + typedef StaggeredKernels ThisKernel; + autoView( UUU_v , UUU, AcceleratorRead); + autoView( U_v , U, AcceleratorRead); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + SiteSpinor * buf = st.CommBuf(); + + int Ls=1; + if(FGrid->Nd()==UGrid->Nd()+1){ + Ls = FGrid->_rdimensions[0]; + } + int Nsite = UGrid->oSites(); + + if( interior && exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} + if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} +#endif + } else if( interior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} +#endif + } else if( exterior ) { + if (Opt == 
OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} +#endif + } + assert(0 && " Kernel optimisation case not covered "); +} +template +void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag, int interior,int exterior) +{ + GridBase *FGrid=in.Grid(); + GridBase *UGrid=U.Grid(); + typedef StaggeredKernels ThisKernel; + autoView( UUU_v , U, AcceleratorRead); + autoView( U_v , U, AcceleratorRead); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + SiteSpinor * buf = st.CommBuf(); + + int Ls=1; + if(FGrid->Nd()==UGrid->Nd()+1){ + Ls = FGrid->_rdimensions[0]; + } + int Nsite = UGrid->oSites(); + + if( interior && exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} +#endif + } else if( interior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} +#endif + } else if( exterior ) { + if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} +#ifndef GRID_CUDA + if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} +#endif + } +} + + +#undef KERNEL_CALLNB +#undef KERNEL_CALL +#undef ASM_CALL + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 36447153..df1bce7c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -98,32 +98,35 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) Coordinate lcoor; typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - for (int site = 0; site < lvol; site++) { - grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - peekLocalSite(Qx, CloverTerm, lcoor); - Qxinv = Zero(); - //if (csw!=0){ - for (int j = 0; j < Ns; j++) - for (int k = 0; k < Ns; k++) - for (int a = 0; a < DimRep; a++) - for (int b = 0; b < DimRep; b++){ - auto zz = Qx()(j, k)(a, b); - EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex(zz); - } - // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; - - EigenInvCloverOp = EigenCloverOp.inverse(); - //std::cout << EigenInvCloverOp << std::endl; - for (int j = 0; j < Ns; j++) - for (int k = 0; k < Ns; k++) - for (int a = 0; a < DimRep; a++) - for (int b = 0; b < DimRep; b++) - Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); - // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; - // } - pokeLocalSite(Qxinv, CloverTermInv, lcoor); + autoView(CTv,CloverTerm,CpuRead); + autoView(CTIv,CloverTermInv,CpuWrite); + for (int site = 0; site < lvol; site++) { + grid->LocalIndexToLocalCoor(site, lcoor); + EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + peekLocalSite(Qx, CTv, lcoor); + Qxinv = Zero(); + //if (csw!=0){ + for (int j = 0; j < Ns; j++) + for (int k = 0; k < Ns; k++) + for (int a = 0; a < DimRep; a++) + for (int b = 0; b < DimRep; b++){ + auto zz = Qx()(j, k)(a, b); + EigenCloverOp(a + 
j * DimRep, b + k * DimRep) = std::complex(zz); + } + // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; + + EigenInvCloverOp = EigenCloverOp.inverse(); + //std::cout << EigenInvCloverOp << std::endl; + for (int j = 0; j < Ns; j++) + for (int k = 0; k < Ns; k++) + for (int a = 0; a < DimRep; a++) + for (int b = 0; b < DimRep; b++) + Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); + // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; + // } + pokeLocalSite(Qxinv, CTIv, lcoor); + } } // Separate the even and odd parts diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 2a202a77..2cc308cc 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -580,16 +580,21 @@ void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh - for(int idx=0;idx<_grid->lSites();idx++){ - Coordinate lcoor(Nd); - Tcomplex cc; - // RealD sgn; - _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha,lcoor); - assert((double)real(cc)>=1.0); - assert(fabs((double)imag(cc))<=1.0e-15); - cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a,lcoor); + + { + autoView(cosha_v,cosha,CpuRead); + autoView(a_v,a,CpuWrite); + for(int idx=0;idx<_grid->lSites();idx++){ + Coordinate lcoor(Nd); + Tcomplex cc; + // RealD sgn; + _grid->LocalIndexToLocalCoor(idx,lcoor); + peekLocalSite(cc,cosha_v,lcoor); + assert((double)real(cc)>=1.0); + assert(fabs((double)imag(cc))<=1.0e-15); + cc = ScalComplex(::acosh(real(cc)),0.0); + pokeLocalSite(cc,a_v,lcoor); + } } Wea = ( exp( a) * abs(W) ); @@ -775,17 +780,20 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh + { + autoView(cosha_v,cosha,CpuRead); + autoView(a_v,a,CpuWrite); for(int idx=0;idx<_grid->lSites();idx++){ Coordinate lcoor(Nd); Tcomplex cc; // RealD sgn; _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha,lcoor); + peekLocalSite(cc,cosha_v,lcoor); assert((double)real(cc)>=1.0); assert(fabs((double)imag(cc))<=1.0e-15); cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a,lcoor); - } + pokeLocalSite(cc,a_v,lcoor); + }} Wea = ( exp( a) * abs(W) ); Wema= ( exp(-a) * abs(W) ); diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 5267e0c1..4977ea68 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -43,7 +43,7 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p, const WilsonAnisotropyCoefficients &anis) - : + : Kernels(p), _grid(&Fgrid), _cbgrid(&Hgrid), @@ -67,11 +67,101 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, diag_mass = 4.0 + mass; } - + int vol4; + vol4=Fgrid.oSites(); + Stencil.BuildSurfaceList(1,vol4); + vol4=Hgrid.oSites(); + StencilEven.BuildSurfaceList(1,vol4); + StencilOdd.BuildSurfaceList(1,vol4); } +template +void WilsonFermion::Report(void) +{ + RealD NP = _grid->_Nprocessors; + RealD NN = _grid->NodeCount(); + RealD volume 
= 1; + Coordinate latt = _grid->GlobalDimensions(); + for(int mu=0;mu 0 ) { + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + } + + if ( DerivCalls > 0 ) { + std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " < 0 || DhopCalls > 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil" < 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" < +void WilsonFermion::ZeroCounters(void) { + DhopCalls = 0; // ok + DhopCommTime = 0; + DhopComputeTime = 0; + DhopComputeTime2= 0; + DhopFaceTime = 0; + DhopTotalTime = 0; + + DerivCalls = 0; // ok + DerivCommTime = 0; + DerivComputeTime = 0; + DerivDhopComputeTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + StencilOdd.ZeroCountersi(); +} + + template -void WilsonFermion::ImportGauge(const GaugeField &_Umu) +void WilsonFermion::ImportGauge(const GaugeField &_Umu) { GaugeField HUmu(_Umu.Grid()); @@ -102,7 +192,7 @@ void WilsonFermion::ImportGauge(const GaugeField &_Umu) ///////////////////////////// template -void WilsonFermion::M(const FermionField &in, FermionField &out) +void WilsonFermion::M(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); @@ -110,7 +200,7 @@ void WilsonFermion::M(const FermionField &in, FermionField &out) } template -void WilsonFermion::Mdag(const FermionField &in, FermionField &out) +void WilsonFermion::Mdag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerYes); @@ -118,7 +208,7 @@ void WilsonFermion::Mdag(const FermionField &in, FermionField &out) } template -void WilsonFermion::Meooe(const FermionField &in, FermionField &out) +void WilsonFermion::Meooe(const FermionField &in, FermionField &out) { if 
(in.Checkerboard() == Odd) { DhopEO(in, out, DaggerNo); @@ -128,7 +218,7 @@ void WilsonFermion::Meooe(const FermionField &in, FermionField &out) } template -void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) +void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) { if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerYes); @@ -136,9 +226,9 @@ void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) DhopOE(in, out, DaggerYes); } } - + template -void WilsonFermion::Mooee(const FermionField &in, FermionField &out) +void WilsonFermion::Mooee(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); typename FermionField::scalar_type scal(diag_mass); @@ -146,80 +236,80 @@ void WilsonFermion::Mooee(const FermionField &in, FermionField &out) } template -void WilsonFermion::MooeeDag(const FermionField &in, FermionField &out) +void WilsonFermion::MooeeDag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); Mooee(in, out); } template -void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) +void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); out = (1.0/(diag_mass))*in; } - + template -void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) +void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); MooeeInv(in,out); } template void WilsonFermion::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector twist) -{ +{ typedef typename FermionField::vector_type vector_type; typedef typename FermionField::scalar_type ScalComplex; typedef Lattice > LatComplex; - - // what type LatticeComplex + + // what type LatticeComplex conformable(_grid,out.Grid()); - + Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT }; - + Coordinate latt_size = _grid->_fdimensions; - + FermionField num (_grid); num = Zero(); LatComplex wilson(_grid); wilson= Zero(); LatComplex one (_grid); one = ScalComplex(1.0,0.0); - + LatComplex denom(_grid); denom= Zero(); - LatComplex kmu(_grid); + LatComplex kmu(_grid); ScalComplex ci(0.0,1.0); // momphase = n * 2pi / L for(int mu=0;mu void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag) { + DerivCalls++; assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -237,8 +328,11 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, FermionField Atilde(B.Grid()); Atilde = A; + DerivCommTime-=usecond(); st.HaloExchange(B, compressor); + DerivCommTime+=usecond(); + DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma (1+g)<->(1-g) if dag @@ -246,6 +340,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, int gamma = mu; if (!dag) gamma += Nd; + DerivDhopComputeTime -= usecond(); int Ls=1; Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); @@ -253,11 +348,13 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, // spin trace outer product ////////////////////////////////////////////////// Impl::InsertForce4D(mat, Btilde, Atilde, mu); + DerivDhopComputeTime += usecond(); } + DerivComputeTime += usecond(); } template -void 
WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { conformable(U.Grid(), _grid); conformable(U.Grid(), V.Grid()); @@ -269,13 +366,13 @@ void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, cons } template -void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido) // Motivation: look at the SchurDiff operator - + assert(V.Checkerboard() == Even); assert(U.Checkerboard() == Odd); mat.Checkerboard() = Odd; @@ -284,7 +381,7 @@ void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, co } template -void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); @@ -298,7 +395,7 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co } template -void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) +void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -309,7 +406,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da } template -void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) +void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -321,7 +418,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int } template -void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) +void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -333,18 +430,18 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { DhopDir(in, out, dir, disp); } template -void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) { DhopDirAll(in, out); } template -void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); @@ -356,12 +453,12 @@ void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; template -void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); - assert((out.size()==8)||(out.size()==9)); + 
assert((out.size()==8)||(out.size()==9)); for(int dir=0;dir::DhopDirAll(const FermionField &in, std::vector -void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) { int Ls=1; uint64_t Nsite=in.oSites(); @@ -385,22 +482,23 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { + DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); else -#endif +#endif DhopInternalSerial(st,lo,U,in,out,dag); - + DhopTotalTime+=usecond(); } template void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); @@ -412,38 +510,53 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO ///////////////////////////// std::vector > requests; st.Prepare(); + DhopFaceTime-=usecond(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute interior ///////////////////////////// int Opt = WilsonKernelsStatic::Opt; + DhopComputeTime-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); - } + } + DhopComputeTime+=usecond(); ///////////////////////////// // Complete comms ///////////////////////////// st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + DhopFaceTime-=usecond(); st.CommsMerge(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute exterior ///////////////////////////// + + DhopComputeTime2-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } + DhopComputeTime2+=usecond(); }; @@ -451,24 +564,28 @@ template void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); + DhopCommTime-=usecond(); st.HaloExchange(in, compressor); + DhopCommTime+=usecond(); + DhopComputeTime-=usecond(); int Opt = WilsonKernelsStatic::Opt; if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } + DhopComputeTime+=usecond(); }; /*Change ends */ /******************************************************************************* * Conserved current utilities for Wilson fermions, for contracting propagators - * to make a conserved current sink or inserting the conserved current + * to make a conserved current sink or inserting the conserved current * sequentially. 
******************************************************************************/ template @@ -483,103 +600,23 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, conformable(_grid, q_in_1.Grid()); conformable(_grid, q_in_2.Grid()); conformable(_grid, q_out.Grid()); -#if 0 - PropagatorField tmp1(_grid), tmp2(_grid); - q_out = Zero(); - - // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu). - // Inefficient comms method but not performance critical. - tmp1 = Cshift(q_in_1, mu, 1); - tmp2 = Cshift(q_in_2, mu, 1); - auto tmp1_v = tmp1.View(); - auto tmp2_v = tmp2.View(); - auto q_in_1_v=q_in_1.View(); - auto q_in_2_v=q_in_2.View(); - auto q_out_v = q_out.View(); - auto Umu_v = Umu.View(); - thread_for(sU, Umu.Grid()->oSites(),{ - Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], - q_in_2_v[sU], - q_out_v[sU], - Umu_v, sU, mu); - Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU], - tmp2_v[sU], - q_out_v[sU], - Umu_v, sU, mu); - }); -#else -#endif + assert(0); } template -void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, +void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, PropagatorField &q_out, PropagatorField &src, Current curr_type, unsigned int mu, - unsigned int tmin, + unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx) { conformable(_grid, q_in.Grid()); conformable(_grid, q_out.Grid()); -#if 0 - - // Lattice> ph(_grid), coor(_grid); - Complex i(0.0,1.0); - PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid); - unsigned int tshift = (mu == Tp) ? 1 : 0; - unsigned int LLt = GridDefaultLatt()[Tp]; - - q_out = Zero(); - LatticeInteger coords(_grid); - LatticeCoordinate(coords, Tp); - - // Need q(x + mu) and q(x - mu). - tmp = Cshift(q_in, mu, 1); - tmpFwd = tmp*lattice_cmplx; - tmp = lattice_cmplx*q_in; - tmpBwd = Cshift(tmp, mu, -1); - - auto coords_v = coords.View(); - auto tmpFwd_v = tmpFwd.View(); - auto tmpBwd_v = tmpBwd.View(); - auto Umu_v = Umu.View(); - auto q_out_v = q_out.View(); - - thread_for(sU, Umu.Grid()->oSites(), { - - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. - vPredicate t_mask; - t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask()); - - if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], - q_out_v[sU], - Umu_v, sU, mu, t_mask); - } - - // Repeat for backward direction. 
- t_mask() = ((coords_v[sU] >= (tmin + tshift)) && - (coords_v[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 )); - - timeSlices = Reduce(t_mask()); - - if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], - q_out_v[sU], - Umu_v, sU, mu, t_mask); - } - }); -#else -#endif + assert(0); } NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h new file mode 100644 index 00000000..2e587dfa --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -0,0 +1,574 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + + + Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h + + Copyright (C) 2020 + +Author: Nils Meyer Regensburg University + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +//#if defined(A64FXASM) +#if defined(A64FX) + +// safety include +#include + +// undefine everything related to kernels +#include + +// enable A64FX body +#define WILSONKERNELSASMBODYA64FX +//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") + + /////////////////////////////////////////////////////////// + // If we are A64FX specialise the single precision routine + /////////////////////////////////////////////////////////// +#if defined(DSLASHINTRIN) +//#pragma message ("A64FX Dslash: intrin") +#include +#else +#pragma message ("A64FX Dslash: asm") +#include +#endif + +/// Switch off the 5d vectorised code optimisations +#undef DWFVEC5D + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, undag Kernel, single +///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int 
ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, dag Kernel, single +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void 
+WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + + +// undefine +#include + +/////////////////////////////////////////////////////////// +// If we are A64FX specialise the double precision routine +/////////////////////////////////////////////////////////// + +#if defined(DSLASHINTRIN) +#include +#else +#include +#endif + +// former KNL +//#define MAYBEPERM(A,perm) if (perm) { A ; } +//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) +//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0]; + + +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR + +///////////////////////////////////////////////////////////////// +// XYZT vectorised, undag Kernel, double 
+///////////////////////////////////////////////////////////////// +#undef KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + 
+///////////////////////////////////////////////////////////////// +// XYZT vectorised, dag Kernel, double +///////////////////////////////////////////////////////////////// +#define KERNEL_DAG +#define INTERIOR_AND_EXTERIOR +#undef INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#define INTERIOR +#undef EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +#undef INTERIOR_AND_EXTERIOR +#undef INTERIOR +#define EXTERIOR +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + +template<> void +WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const 
FermionFieldView &in, FermionFieldView &out) +#if defined (WILSONKERNELSASMBODYA64FX) +#include +#else +#include +#endif + + + +// undefs +#undef WILSONKERNELSASMBODYA64FX +#include + +#endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h new file mode 100644 index 00000000..406e5c25 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -0,0 +1,380 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: WilsonKernelsAsmBodyA64FX.h + + Copyright (C) 2020 + +Author: Nils Meyer Regensburg University + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifdef KERNEL_DAG +#define DIR0_PROJ XP_PROJ +#define DIR1_PROJ YP_PROJ +#define DIR2_PROJ ZP_PROJ +#define DIR3_PROJ TP_PROJ +#define DIR4_PROJ XM_PROJ +#define DIR5_PROJ YM_PROJ +#define DIR6_PROJ ZM_PROJ +#define DIR7_PROJ TM_PROJ +#define DIR0_RECON XP_RECON +#define DIR1_RECON YP_RECON_ACCUM +#define DIR2_RECON ZP_RECON_ACCUM +#define DIR3_RECON TP_RECON_ACCUM +#define DIR4_RECON XM_RECON_ACCUM +#define DIR5_RECON YM_RECON_ACCUM +#define DIR6_RECON ZM_RECON_ACCUM +#define DIR7_RECON TM_RECON_ACCUM +#else +#define DIR0_PROJ XM_PROJ +#define DIR1_PROJ YM_PROJ +#define DIR2_PROJ ZM_PROJ +#define DIR3_PROJ TM_PROJ +#define DIR4_PROJ XP_PROJ +#define DIR5_PROJ YP_PROJ +#define DIR6_PROJ ZP_PROJ +#define DIR7_PROJ TP_PROJ +#define DIR0_RECON XM_RECON +#define DIR1_RECON YM_RECON_ACCUM +#define DIR2_RECON ZM_RECON_ACCUM +#define DIR3_RECON TM_RECON_ACCUM +#define DIR4_RECON XP_RECON_ACCUM +#define DIR5_RECON YP_RECON_ACCUM +#define DIR6_RECON ZP_RECON_ACCUM +#define DIR7_RECON TP_RECON_ACCUM +#endif + +//using namespace std; + +#undef SHOW +//#define SHOW + +#undef WHERE + +#ifdef INTERIOR_AND_EXTERIOR +#define WHERE "INT_AND_EXT" +#endif + +#ifdef INTERIOR +#define WHERE "INT" +#endif + +#ifdef EXTERIOR +#define WHERE "EXT" +#endif + +//#pragma message("here") + + + +//////////////////////////////////////////////////////////////////////////////// +// Comms then compute kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR_AND_EXTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + } else { \ + LOAD_CHI(base); \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + 
PREFETCH_CHIMU_L2(basep); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Pre comms kernel -- prefetch like normal because it is mostly right +//////////////////////////////////////////////////////////////////////////////// +#ifdef INTERIOR + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + basep = st.GetPFInfo(nent,plocal); nent++; \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + PREFETCH_CHIMU_L2(basep); \ + } else { PREFETCH_CHIMU(base); } \ + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + PREFETCH1_CHIMU(base); \ + ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) + +#define RESULT(base,basep) SAVE_RESULT(base,basep); + +#endif +//////////////////////////////////////////////////////////////////////////////// +// Post comms kernel +//////////////////////////////////////////////////////////////////////////////// +#ifdef EXTERIOR + + +#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ + } + +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ + MULT_2SPIN_1(Dir); \ + PREFETCH_CHIMU(base); \ + /* PREFETCH_GAUGE_L1(NxtDir); */ \ + MULT_2SPIN_2; \ + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ + RECON; \ + nmu++; \ + } + +#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} + +#endif +{ + int nmu; + int local,perm, ptype; + uint64_t base; + uint64_t basep; + const uint64_t plocal =(uint64_t) & in[0]; + + MASK_REGS; + int nmax=U.oSites(); + for(int site=0;site=nmax) ssn=0; + // int sUn=lo.Reorder(ssn); + int sUn=ssn; + LOCK_GAUGE(0); +#else + int sU =ssU; + int ssn=ssU+1; if(ssn>=nmax) ssn=0; + int sUn=ssn; +#endif + for(int s=0;s +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#pragma once + +#include + + +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT + +#define REGISTER + +#define LOAD_CHIMU \ + {const SiteSpinor & ref (in[offset]); \ + Chimu_00=ref()(0)(0);\ + Chimu_01=ref()(0)(1);\ + Chimu_02=ref()(0)(2);\ + Chimu_10=ref()(1)(0);\ + Chimu_11=ref()(1)(1);\ + Chimu_12=ref()(1)(2);\ + Chimu_20=ref()(2)(0);\ + Chimu_21=ref()(2)(1);\ + Chimu_22=ref()(2)(2);\ + Chimu_30=ref()(3)(0);\ + Chimu_31=ref()(3)(1);\ + Chimu_32=ref()(3)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \ + std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \ + std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \ + std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \ + std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \ + std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \ + std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \ + std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \ + std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \ + std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \ + std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \ + std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \ + std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \ +} + +#define LOAD_CHI\ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = ref()(0)(0);\ + Chi_01 = ref()(0)(1);\ + Chi_02 = ref()(0)(2);\ + Chi_10 = ref()(1)(0);\ + Chi_11 = ref()(1)(1);\ + Chi_12 = ref()(1)(2);\ + std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ + } + +// To splat or not to splat depends on the implementation +#define MULT_2SPIN(A)\ + {auto & ref(U[sU](A)); \ + Impl::loadLinkElement(U_00,ref()(0,0)); \ + Impl::loadLinkElement(U_10,ref()(1,0)); \ + Impl::loadLinkElement(U_20,ref()(2,0)); \ + Impl::loadLinkElement(U_01,ref()(0,1)); \ + Impl::loadLinkElement(U_11,ref()(1,1)); \ + Impl::loadLinkElement(U_21,ref()(2,1)); \ + UChi_00 = U_00*Chi_00;\ + UChi_10 = U_00*Chi_10;\ + UChi_01 = U_10*Chi_00;\ + UChi_11 = U_10*Chi_10;\ + UChi_02 = U_20*Chi_00;\ + UChi_12 = U_20*Chi_10;\ + UChi_00+= U_01*Chi_01;\ + 
UChi_10+= U_01*Chi_11;\ + UChi_01+= U_11*Chi_01;\ + UChi_11+= U_11*Chi_11;\ + UChi_02+= U_21*Chi_01;\ + UChi_12+= U_21*Chi_11;\ + Impl::loadLinkElement(U_00,ref()(0,2)); \ + Impl::loadLinkElement(U_10,ref()(1,2)); \ + Impl::loadLinkElement(U_20,ref()(2,2)); \ + UChi_00+= U_00*Chi_02;\ + UChi_10+= U_00*Chi_12;\ + UChi_01+= U_10*Chi_02;\ + UChi_11+= U_10*Chi_12;\ + UChi_02+= U_20*Chi_02;\ + UChi_12+= U_20*Chi_12;\ + std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \ + std::cout << "UChi_00 -- " << UChi_00 << std::endl; \ + std::cout << "UChi_01 -- " << UChi_01 << std::endl; \ + std::cout << "UChi_02 -- " << UChi_02 << std::endl; \ + std::cout << "UChi_10 -- " << UChi_10 << std::endl; \ + std::cout << "UChi_11 -- " << UChi_11 << std::endl; \ + std::cout << "UChi_12 -- " << UChi_12 << std::endl; \ + } + + +#define PERMUTE_DIR(dir) \ +std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \ +std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ +std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ +std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ +std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ +std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ +std::cout << "Chi_12 -- " << Chi_12 << std::endl; \ + permute##dir(Chi_00,Chi_00);\ + permute##dir(Chi_01,Chi_01);\ + permute##dir(Chi_02,Chi_02);\ + permute##dir(Chi_10,Chi_10);\ + permute##dir(Chi_11,Chi_11);\ + permute##dir(Chi_12,Chi_12);\ + std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +// hspin(0)=fspin(0)+timesI(fspin(3)); +// hspin(1)=fspin(1)+timesI(fspin(2)); +#define XP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_30);\ + Chi_01 = Chimu_01+timesI(Chimu_31);\ + Chi_02 = Chimu_02+timesI(Chimu_32);\ + Chi_10 = Chimu_10+timesI(Chimu_20);\ + Chi_11 = Chimu_11+timesI(Chimu_21);\ + Chi_12 = Chimu_12+timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define YP_PROJ \ + Chi_00 = Chimu_00-Chimu_30;\ + Chi_01 = Chimu_01-Chimu_31;\ + Chi_02 = Chimu_02-Chimu_32;\ + Chi_10 = Chimu_10+Chimu_20;\ + Chi_11 = Chimu_11+Chimu_21;\ + Chi_12 = Chimu_12+Chimu_22;\ + std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define ZP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_20); \ + Chi_01 = Chimu_01+timesI(Chimu_21); \ + Chi_02 = Chimu_02+timesI(Chimu_22); \ + Chi_10 = Chimu_10-timesI(Chimu_30); \ + Chi_11 = Chimu_11-timesI(Chimu_31); \ + Chi_12 = Chimu_12-timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; 
\ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define TP_PROJ \ + Chi_00 = Chimu_00+Chimu_20; \ + Chi_01 = Chimu_01+Chimu_21; \ + Chi_02 = Chimu_02+Chimu_22; \ + Chi_10 = Chimu_10+Chimu_30; \ + Chi_11 = Chimu_11+Chimu_31; \ + Chi_12 = Chimu_12+Chimu_32;\ + std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + + +// hspin(0)=fspin(0)-timesI(fspin(3)); +// hspin(1)=fspin(1)-timesI(fspin(2)); +#define XM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_30);\ + Chi_01 = Chimu_01-timesI(Chimu_31);\ + Chi_02 = Chimu_02-timesI(Chimu_32);\ + Chi_10 = Chimu_10-timesI(Chimu_20);\ + Chi_11 = Chimu_11-timesI(Chimu_21);\ + Chi_12 = Chimu_12-timesI(Chimu_22);\ + std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define YM_PROJ \ + Chi_00 = Chimu_00+Chimu_30;\ + Chi_01 = Chimu_01+Chimu_31;\ + Chi_02 = Chimu_02+Chimu_32;\ + Chi_10 = Chimu_10-Chimu_20;\ + Chi_11 = Chimu_11-Chimu_21;\ + Chi_12 = Chimu_12-Chimu_22;\ + std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define ZM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_20); \ + Chi_01 = Chimu_01-timesI(Chimu_21); \ + Chi_02 = Chimu_02-timesI(Chimu_22); \ + Chi_10 = Chimu_10+timesI(Chimu_30); \ + Chi_11 = Chimu_11+timesI(Chimu_31); \ + Chi_12 = Chimu_12+timesI(Chimu_32);\ + std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +#define TM_PROJ \ + Chi_00 = Chimu_00-Chimu_20; \ + Chi_01 = Chimu_01-Chimu_21; \ + Chi_02 = Chimu_02-Chimu_22; \ + Chi_10 = Chimu_10-Chimu_30; \ + Chi_11 = Chimu_11-Chimu_31; \ + Chi_12 = Chimu_12-Chimu_32;\ + std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \ + std::cout << "Chi_00 -- " << Chi_00 << std::endl; \ + std::cout << "Chi_01 -- " << Chi_01 << std::endl; \ + std::cout << "Chi_02 -- " << Chi_02 << std::endl; \ + std::cout << "Chi_10 -- " << Chi_10 << std::endl; \ + std::cout << "Chi_11 -- " << Chi_11 << std::endl; \ + std::cout << "Chi_12 -- " << Chi_12 << std::endl; + +// fspin(0)=hspin(0); +// fspin(1)=hspin(1); +// fspin(2)=timesMinusI(hspin(1)); +// fspin(3)=timesMinusI(hspin(0)); +#define XP_RECON\ + result_00 = UChi_00;\ + result_01 = 
UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesMinusI(UChi_10);\ + result_21 = timesMinusI(UChi_11);\ + result_22 = timesMinusI(UChi_12);\ + result_30 = timesMinusI(UChi_00);\ + result_31 = timesMinusI(UChi_01);\ + result_32 = timesMinusI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XP_RECON_ACCUM\ + result_00+=UChi_00;\ + result_01+=UChi_01;\ + result_02+=UChi_02;\ + result_10+=UChi_10;\ + result_11+=UChi_11;\ + result_12+=UChi_12;\ + result_20-=timesI(UChi_10);\ + result_21-=timesI(UChi_11);\ + result_22-=timesI(UChi_12);\ + result_30-=timesI(UChi_00);\ + result_31-=timesI(UChi_01);\ + result_32-=timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XM_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesI(UChi_10);\ + result_21 = timesI(UChi_11);\ + result_22 = timesI(UChi_12);\ + result_30 = timesI(UChi_00);\ + result_31 = timesI(UChi_01);\ + result_32 = timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define XM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= 
UChi_12;\ + result_20+= timesI(UChi_10);\ + result_21+= timesI(UChi_11);\ + result_22+= timesI(UChi_12);\ + result_30+= timesI(UChi_00);\ + result_31+= timesI(UChi_01);\ + result_32+= timesI(UChi_02);\ + std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define YP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_10;\ + result_21+= UChi_11;\ + result_22+= UChi_12;\ + result_30-= UChi_00;\ + result_31-= UChi_01;\ + result_32-= UChi_02;\ + std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define YM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_10;\ + result_21-= UChi_11;\ + result_22-= UChi_12;\ + result_30+= UChi_00;\ + result_31+= UChi_01;\ + result_32+= UChi_02;\ + std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define ZP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= timesI(UChi_00); \ + result_21-= timesI(UChi_01); \ + result_22-= timesI(UChi_02); \ + result_30+= timesI(UChi_10); \ + result_31+= timesI(UChi_11); \ + result_32+= 
timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define ZM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_00); \ + result_21+= timesI(UChi_01); \ + result_22+= timesI(UChi_02); \ + result_30-= timesI(UChi_10); \ + result_31-= timesI(UChi_11); \ + result_32-= timesI(UChi_12);\ + std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define TP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_00; \ + result_21+= UChi_01; \ + result_22+= UChi_02; \ + result_30+= UChi_10; \ + result_31+= UChi_11; \ + result_32+= UChi_12;\ + std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define TM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_00; \ + result_21-= UChi_01; \ + result_22-= UChi_02; \ + result_30-= UChi_10; \ + result_31-= UChi_11; \ + result_32-= UChi_12;\ + std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << 
result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl; + +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU; \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else if ( st.same_node[DIR] ) { \ + LOAD_CHI; \ + } \ + if (local || st.same_node[DIR] ) { \ + MULT_2SPIN(DIR); \ + RECON; \ + } + +#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ + SE=st.GetEntry(ptype,DIR,ss); \ + offset = SE->_offset; \ + if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ + LOAD_CHI; \ + MULT_2SPIN(DIR); \ + RECON; \ + nmu++; \ + } + +#define HAND_RESULT(ss) \ + { \ + SiteSpinor & ref (out[ss]); \ + vstream(ref()(0)(0),result_00); \ + vstream(ref()(0)(1),result_01); \ + vstream(ref()(0)(2),result_02); \ + vstream(ref()(1)(0),result_10); \ + vstream(ref()(1)(1),result_11); \ + vstream(ref()(1)(2),result_12); \ + vstream(ref()(2)(0),result_20); \ + vstream(ref()(2)(1),result_21); \ + vstream(ref()(2)(2),result_22); \ + vstream(ref()(3)(0),result_30); \ + vstream(ref()(3)(1),result_31); \ + vstream(ref()(3)(2),result_32); \ + std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl;\ + } + +#define HAND_RESULT_EXT(ss) \ + if (nmu){ \ + SiteSpinor & ref (out[ss]); \ + ref()(0)(0)+=result_00; \ + ref()(0)(1)+=result_01; \ + ref()(0)(2)+=result_02; \ + ref()(1)(0)+=result_10; \ + ref()(1)(1)+=result_11; \ + ref()(1)(2)+=result_12; \ + ref()(2)(0)+=result_20; \ + ref()(2)(1)+=result_21; \ + ref()(2)(2)+=result_22; \ + ref()(3)(0)+=result_30; \ + ref()(3)(1)+=result_31; \ + ref()(3)(2)+=result_32; \ + std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \ + std::cout << "result_00 -- " << result_00 << std::endl; \ + std::cout << "result_01 -- " << result_01 << std::endl; \ + std::cout << "result_02 -- " << result_02 << std::endl; \ + std::cout << "result_10 -- " << result_10 << 
std::endl; \ + std::cout << "result_11 -- " << result_11 << std::endl; \ + std::cout << "result_12 -- " << result_12 << std::endl; \ + std::cout << "result_20 -- " << result_20 << std::endl; \ + std::cout << "result_21 -- " << result_21 << std::endl; \ + std::cout << "result_22 -- " << result_22 << std::endl; \ + std::cout << "result_30 -- " << result_30 << std::endl; \ + std::cout << "result_31 -- " << result_31 << std::endl; \ + std::cout << "result_32 -- " << result_32 << std::endl;\ + } + + +#define HAND_DECLARATIONS(a) \ + Simd result_00; \ + Simd result_01; \ + Simd result_02; \ + Simd result_10; \ + Simd result_11; \ + Simd result_12; \ + Simd result_20; \ + Simd result_21; \ + Simd result_22; \ + Simd result_30; \ + Simd result_31; \ + Simd result_32; \ + Simd Chi_00; \ + Simd Chi_01; \ + Simd Chi_02; \ + Simd Chi_10; \ + Simd Chi_11; \ + Simd Chi_12; \ + Simd UChi_00; \ + Simd UChi_01; \ + Simd UChi_02; \ + Simd UChi_10; \ + Simd UChi_11; \ + Simd UChi_12; \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21;\ + Simd debugreg;\ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ + +#define ZERO_RESULT \ + result_00=Zero(); \ + result_01=Zero(); \ + result_02=Zero(); \ + result_10=Zero(); \ + result_11=Zero(); \ + result_12=Zero(); \ + result_20=Zero(); \ + result_21=Zero(); \ + result_22=Zero(); \ + result_30=Zero(); \ + result_31=Zero(); \ + result_32=Zero(); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + +NAMESPACE_BEGIN(Grid); + +template void +WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
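For reference, the reconstruction macros above are the per-direction spin structure of the Wilson hopping term written out register by register: the upper two spin rows copy the gauge-multiplied half spinor, and the lower two are rebuilt from them with a sign/phase fixed by the direction. A minimal scalar sketch of what XP_RECON computes, using std::complex in place of the Simd type (an illustration only, not Grid code):

  #include <array>
  #include <complex>

  using Cplx       = std::complex<double>;
  using HalfSpinor = std::array<std::array<Cplx,3>,2>; // 2 spin x 3 colour
  using Spinor     = std::array<std::array<Cplx,3>,4>; // 4 spin x 3 colour

  Spinor xpRecon(const HalfSpinor &UChi) {
    const Cplx minusI(0.0, -1.0);
    Spinor r{};
    for (int c = 0; c < 3; c++) {
      r[0][c] = UChi[0][c];            // result_0c = UChi_0c
      r[1][c] = UChi[1][c];            // result_1c = UChi_1c
      r[2][c] = minusI * UChi[1][c];   // result_2c = timesMinusI(UChi_1c)
      r[3][c] = minusI * UChi[0][c];   // result_3c = timesMinusI(UChi_0c)
    }
    return r;
  }

The _ACCUM variants follow the same pattern but add into the result registers instead of assigning, which is why only the first leg of a site uses the plain RECON form.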
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + + HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); + HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset,local,perm, ptype; + StencilEntry *SE; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset,local,perm, ptype; + ZERO_RESULT; + HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT(ss); +} + +template void +WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
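The Int/Ext kernel pairs above split the eight directional legs into a pass over data that is already resident (local sites, or neighbours on the same node) and a pass over halo data that arrives by communication; the exterior pass counts the legs it touched in nmu so HAND_RESULT_EXT only accumulates into the output when something was actually received. A compact sketch of that split, with placeholder names rather than Grid's API:

  // StencilEntryLite and processSite are illustrative only, not Grid types.
  struct StencilEntryLite { bool is_local; bool same_node; };

  template <class LegOp>
  int processSite(const StencilEntryLite *legs, int nlegs, bool interior, LegOp op) {
    int nmu = 0;
    for (int d = 0; d < nlegs; d++) {
      bool near = legs[d].is_local || legs[d].same_node; // data already available?
      if (interior ? near : !near) {                     // _INT legs vs _EXT legs
        op(d);                                           // project, multiply, reconstruct
        nmu++;
      }
    }
    return nmu;  // exterior pass: the final write is skipped when nmu == 0
  }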
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + int offset, ptype; + StencilEntry *SE; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +template +void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + HAND_DECLARATIONS(ignore); + + StencilEntry *SE; + int offset, ptype; + int nmu=0; + ZERO_RESULT; + HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); + HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); + HAND_RESULT_EXT(ss); +} + +////////////// Wilson ; uses this implementation ///////////////////// + +NAMESPACE_END(Grid); +#undef LOAD_CHIMU +#undef LOAD_CHI +#undef MULT_2SPIN +#undef PERMUTE_DIR +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XP_RECON_ACCUM +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef ZERO_RESULT +#undef Chimu_00 +#undef Chimu_01 +#undef Chimu_02 +#undef Chimu_10 +#undef Chimu_11 +#undef Chimu_12 +#undef Chimu_20 +#undef Chimu_21 +#undef Chimu_22 +#undef Chimu_30 +#undef Chimu_31 +#undef Chimu_32 +#undef HAND_STENCIL_LEG +#undef HAND_STENCIL_LEG_INT +#undef HAND_STENCIL_LEG_EXT +#undef HAND_RESULT +#undef HAND_RESULT_INT +#undef HAND_RESULT_EXT diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 0ff72789..c2b62416 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -39,19 +39,21 @@ NAMESPACE_BEGIN(Grid); // Generic implementation; move to different file? 
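The block of #undef directives closing the hand-unrolled file above is a deliberate convention: every helper macro is removed once the kernels have been emitted, so several kernel variants can be pulled into a single translation unit without macro redefinition clashes. In sketch form, with placeholder bodies that are not Grid's definitions:

  #define LOAD_CHI      /* architecture-specific load of the projected half spinor */
  #define MULT_2SPIN(d) /* architecture-specific SU(3) multiply on both spin rows  */

  /* ... hand-unrolled kernels built from these macros ... */

  #undef LOAD_CHI       // leave the preprocessor clean so another variant can be
  #undef MULT_2SPIN     // #included and define the same names differently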
//////////////////////////////////////////// +/* accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) { -#ifdef __CUDA_ARCH__ - static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); +#ifdef GRID_SIMT + static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads uint4 * chip_pun = (uint4 *)&chip; * chip_pun = * mem_pun; -#else +#else chip = *mem; #endif return; } - +*/ + #define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ if (SE->_is_local) { \ @@ -61,10 +63,10 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } else { \ chi = coalescedRead(buf[SE->_offset],lane); \ } \ - synchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); - + #define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ if (SE->_is_local) { \ @@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } else if ( st.same_node[Dir] ) { \ chi = coalescedRead(buf[SE->_offset],lane); \ } \ - synchronise(); \ + acceleratorSynchronise(); \ if (SE->_is_local || st.same_node[Dir] ) { \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Recon(result, Uchi); \ } \ - synchronise(); + acceleratorSynchronise(); #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ SE = st.GetEntry(ptype, Dir, sF); \ @@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) Recon(result, Uchi); \ nmu++; \ } \ - synchronise(); + acceleratorSynchronise(); #define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ if (SE->_is_local ) { \ @@ -99,9 +101,9 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) } else { \ chi = coalescedRead(buf[SE->_offset],lane); \ } \ - synchronise(); \ + acceleratorSynchronise(); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ - Recon(result, Uchi); + Recon(result, Uchi); #define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \ if (gamma == Dir) { \ @@ -126,7 +128,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV StencilEntry *SE; int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); @@ -141,7 +143,7 @@ void WilsonKernels::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV template void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -153,7 +155,7 @@ void WilsonKernels::GenericDhopSite(StencilView &st, DoubledGaugeFieldView int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); @@ -181,7 +183,7 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi StencilEntry *SE; int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); 
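The renamed acceleratorSIMTlane/acceleratorSynchronise calls in the generic legs below keep the same access pattern as before: in a SIMT build every thread owns one lane of a vectorised site, and coalescedRead/coalescedWrite pick out just that scalar so adjacent threads issue adjacent loads, while in a host build whole vectors are processed at once. A toy version of the lane addressing (stand-in types, not Grid's vComplex or accessors):

  #include <array>
  #include <complex>

  constexpr int Nsimd = 8;
  using Scalar     = std::complex<double>;
  using VectorSite = std::array<Scalar, Nsimd>;   // one vectorised lattice site

  // The thread owning `lane` reads/writes only its element of the vector.
  inline Scalar readLane(const VectorSite &v, int lane)             { return v[lane]; }
  inline void   writeLane(VectorSite &v, const Scalar &s, int lane) { v[lane] = s; }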
result=Zero(); GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); @@ -198,12 +200,12 @@ void WilsonKernels::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi template void WilsonKernels::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); calcHalfSpinor chi; // calcHalfSpinor *chi_p; @@ -239,7 +241,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi int ptype; int nmu=0; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); result=Zero(); GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); @@ -249,7 +251,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm); GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm); - if ( nmu ) { + if ( nmu ) { auto out_t = coalescedRead(out[sF],lane); out_t = out_t + result; coalescedWrite(out[sF],out_t,lane); @@ -259,7 +261,7 @@ void WilsonKernels::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi template void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out) + int sU, const FermionFieldView &in, FermionFieldView &out) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor; @@ -270,7 +272,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField int ptype; int nmu=0; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); result=Zero(); GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); @@ -280,7 +282,7 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm); GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm); GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm); - if ( nmu ) { + if ( nmu ) { auto out_t = coalescedRead(out[sF],lane); out_t = out_t + result; coalescedWrite(out[sF],out_t,lane); @@ -300,12 +302,12 @@ void WilsonKernels::GenericDhopSiteExt(StencilView &st, DoubledGaugeField StencilEntry *SE; \ int ptype; \ const int Nsimd = SiteHalfSpinor::Nsimd(); \ - const int lane=SIMTlane(Nsimd); \ + const int lane=acceleratorSIMTlane(Nsimd); \ \ SE = st.GetEntry(ptype, dir, sF); \ GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ coalescedWrite(out[sF], result,lane); \ - } + } DhopDirMacro(Xp,spProjXp,spReconXp); DhopDirMacro(Yp,spProjYp,spReconYp); @@ -316,9 +318,9 @@ DhopDirMacro(Ym,spProjYm,spReconYm); DhopDirMacro(Zm,spProjZm,spReconZm); DhopDirMacro(Tm,spProjTm,spReconTm); -template +template void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, - int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) + int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) { typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef 
decltype(coalescedRead(in[0])) calcSpinor; @@ -328,7 +330,7 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si StencilEntry *SE; int ptype; const int Nsimd = SiteHalfSpinor::Nsimd(); - const int lane=SIMTlane(Nsimd); + const int lane=acceleratorSIMTlane(Nsimd); SE = st.GetEntry(ptype, dir, sF); GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); @@ -344,54 +346,55 @@ void WilsonKernels::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si template void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, - int Nsite, const FermionField &in, std::vector &out) + int Nsite, const FermionField &in, std::vector &out) { - auto U_v = U.View(); - auto in_v = in.View(); - auto st_v = st.View(); + autoView(U_v ,U,AcceleratorRead); + autoView(in_v ,in,AcceleratorRead); + autoView(st_v ,st,AcceleratorRead); - auto out_Xm = out[0].View(); - auto out_Ym = out[1].View(); - auto out_Zm = out[2].View(); - auto out_Tm = out[3].View(); - auto out_Xp = out[4].View(); - auto out_Yp = out[5].View(); - auto out_Zp = out[6].View(); - auto out_Tp = out[7].View(); - - accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ - int sU=sss/Ls; - int sF =sss; - DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); - DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); - DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); - DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); - DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); - DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); - DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); - DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); + autoView(out_Xm,out[0],AcceleratorWrite); + autoView(out_Ym,out[1],AcceleratorWrite); + autoView(out_Zm,out[2],AcceleratorWrite); + autoView(out_Tm,out[3],AcceleratorWrite); + autoView(out_Xp,out[4],AcceleratorWrite); + autoView(out_Yp,out[5],AcceleratorWrite); + autoView(out_Zp,out[6],AcceleratorWrite); + autoView(out_Tp,out[7],AcceleratorWrite); + auto CBp=st.CommBuf(); + accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{ + int sU=sss/Ls; + int sF =sss; + DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0); + DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1); + DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2); + DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3); + DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4); + DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5); + DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6); + DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7); }); } template void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, - int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) + int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma) { assert(dirdisp<=7); assert(dirdisp>=0); - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + autoView(U_v ,U ,AcceleratorRead); + autoView(in_v ,in ,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v ,st ,AcceleratorRead); + auto CBp=st.CommBuf(); #define LoopBody(Dir) \ - case Dir : \ - accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \ + case Dir : \ + accelerator_for(ss,Nsite,Simd::Nsimd(),{ \ for(int s=0;s::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S break; } #undef LoopBody -} +} #define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ @@ -421,7 +424,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ }); -#define 
KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); +#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); #define ASM_CALL(A) \ thread_for( ss, Nsite, { \ @@ -433,28 +436,28 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, - int interior,int exterior) + int interior,int exterior) { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + autoView(U_v , U,AcceleratorRead); + autoView(in_v , in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v , st,AcceleratorRead); - if( interior && exterior ) { + if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif - } else if( exterior ) { + } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} #endif @@ -464,28 +467,28 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField template void WilsonKernels::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, - int interior,int exterior) + int interior,int exterior) { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + autoView(U_v ,U,AcceleratorRead); + autoView(in_v ,in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v ,st,AcceleratorRead); - if( interior && exterior ) { + if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif - } else if( exterior ) { + } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} -#ifndef GRID_NVCC +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif @@ -493,5 +496,8 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, 
DoubledGaugeField assert(0 && " Kernel optimisation case not covered "); } -NAMESPACE_END(Grid); +#undef KERNEL_CALLNB +#undef KERNEL_CALL +#undef ASM_CALL +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc new file mode 100644 index 00000000..c424cb2d --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc @@ -0,0 +1,36 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +const std::vector NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3}); +const std::vector NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1}); + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master new file mode 100644 index 00000000..75b75678 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc.master @@ -0,0 +1,37 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc + +Copyright (C) 2015 + +Author: Azusa Yamaguchi, Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
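The bare U.View()/in.View() calls replaced in the kernels above left the caller responsible for how the data would be accessed and for releasing the view; autoView(name, field, mode) opens the view with an explicit AcceleratorRead or AcceleratorWrite intent and ties its lifetime to the enclosing scope. A self-contained toy of that RAII pattern, with stand-in types that are not Grid's implementation:

  #include <cstdio>

  enum ViewMode { AcceleratorRead, AcceleratorWrite };

  struct FakeLattice {
    struct View { ViewMode mode; void close() { std::printf("view closed\n"); } };
    View open(ViewMode m) { return View{m}; }   // e.g. make the data resident on device
  };

  struct ScopedView {                           // what autoView(v,l,mode) provides, in spirit
    FakeLattice::View v;
    ScopedView(FakeLattice &l, ViewMode m) : v(l.open(m)) {}
    ~ScopedView() { v.close(); }                // closing can no longer be forgotten
  };

  int main() {
    FakeLattice out;
    ScopedView out_v(out, AcceleratorWrite);    // cf. autoView(out_v, out, AcceleratorWrite)
    // ... launch the Dhop kernel using out_v.v ...
    return 0;
  }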
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ +#include +#include + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class NaiveStaggeredFermion; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc new file mode 120000 index 00000000..42057f56 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplD/NaiveStaggeredFermionInstantiationStaggeredImplD.cc @@ -0,0 +1 @@ +../NaiveStaggeredFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc new file mode 120000 index 00000000..42057f56 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/StaggeredImplF/NaiveStaggeredFermionInstantiationStaggeredImplF.cc @@ -0,0 +1 @@ +../NaiveStaggeredFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master index 9af5ed85..f0b15e3b 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master @@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/WilsonKernels.cc -Copyright (C) 2015 +Copyright (C) 2015, 2020 Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Nils Meyer Regensburg University This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,14 +35,17 @@ directory #ifndef AVX512 #ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE #include #endif #endif +#endif +#endif NAMESPACE_BEGIN(Grid); #include "impl.h" -template class WilsonKernels; +template class WilsonKernels; NAMESPACE_END(Grid); - diff --git a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc index f6f235c8..a8e9e6d9 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc +++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationAsm.cc @@ -37,6 +37,7 @@ directory //////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); #include +#include #include NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either 
version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc deleted file mode 120000 index 01c35e7b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc new file mode 100644 index 00000000..f0b15e3b --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc @@ -0,0 +1,51 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/fermion/WilsonKernels.cc + +Copyright (C) 2015, 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle +Author: Nils Meyer Regensburg University + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +#ifndef AVX512 +#ifndef QPX +#ifndef A64FX +#ifndef A64FXFIXEDSIZE +#include +#endif +#endif +#endif +#endif + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class WilsonKernels; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index 330dcfa8..72a9eaf9 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -88,6 +88,7 @@ done CC_LIST=" \ ImprovedStaggeredFermion5DInstantiation \ ImprovedStaggeredFermionInstantiation \ + NaiveStaggeredFermionInstantiation \ StaggeredKernelsInstantiation " for impl in $STAG_IMPL_LIST diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index b9a5296d..9b7d5a60 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -86,9 +86,9 @@ public: // Move this elsewhere? 
FIXME static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W - auto U_v = U.View(); - auto W_v = W.View(); - thread_for( ss, U.Grid()->oSites(), { + autoView(U_v,U,AcceleratorWrite); + autoView(W_v,W,AcceleratorRead); + accelerator_for( ss, U.Grid()->oSites(), 1, { U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); }); } @@ -131,15 +131,14 @@ public: //static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); - auto U_v = U.View(); - auto P_v = P.View(); - thread_for(ss, P.Grid()->oSites(),{ + autoView(U_v,U,AcceleratorWrite); + autoView(P_v,P,AcceleratorRead); + accelerator_for(ss, P.Grid()->oSites(),1,{ for (int mu = 0; mu < Nd; mu++) { U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); } }); - - //auto end = std::chrono::high_resolution_clock::now(); + //auto end = std::chrono::high_resolution_clock::now(); // diff += end - start; // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; } diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index febb315e..14675b11 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -1,5 +1,13 @@ #pragma once +#define CPS_MD_TIME + +#ifdef CPS_MD_TIME +#define HMC_MOMENTUM_DENOMINATOR (2.0) +#else +#define HMC_MOMENTUM_DENOMINATOR (1.0) +#endif + NAMESPACE_BEGIN(Grid); template @@ -20,7 +28,9 @@ public: typedef Field PropagatorField; static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling gaussian(pRNG, P); + P *= scale; } static inline Field projectForce(Field& P){return P;} @@ -66,7 +76,7 @@ public: } static void FreePropagator(const Field &in, Field &out, - const Field &momKernel) + const Field &momKernel) { FFT fft((GridCartesian *)in.Grid()); Field inFT(in.Grid()); @@ -139,14 +149,17 @@ public: static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) { + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling #ifndef USE_FFT_ACCELERATION Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P); + #else Field Pgaussian(P.Grid()), Pp(P.Grid()); ComplexField p2(P.Grid()); p2 = zero; RealD M = FFT_MASS; - + + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian); FFT theFFT((GridCartesian*)P.Grid()); @@ -156,17 +169,17 @@ public: p2 = sqrt(p2); Pp *= p2; theFFT.FFT_all_dim(P, Pp, FFT::backward); - #endif //USE_FFT_ACCELERATION + P *= scale; } - static inline Field projectForce(Field& P) {return P;} + static inline Field projectForce(Field& P) {return Ta(P);} static inline void update_field(Field &P, Field &U, double ep) { #ifndef USE_FFT_ACCELERATION double t0=usecond(); - U += P*ep; + U += P*ep; double t1=usecond(); double total_time = (t1-t0)/1e6; std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl; diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index 3be84480..5a5f9251 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -89,8 +89,8 @@ public: action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared; - auto p_v = p.View(); - auto action_v = action.View(); + autoView( p_v , p, CpuRead); + autoView( action_v , action, CpuWrite); for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils @@ -146,8 +146,8 
@@ public: for (int point = 0; point < npoint; point++) { - auto p_v = p.View(); - auto force_v = force.View(); + autoView( p_v , p, CpuRead); + autoView( force_v , force, CpuWrite); int permute_type; StencilEntry *SE; diff --git a/Grid/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h index 459e1d0b..28a9fdae 100644 --- a/Grid/qcd/modules/Registration.h +++ b/Grid/qcd/modules/Registration.h @@ -80,10 +80,11 @@ static Registrar, static Registrar< ConjugateGradientModule, HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); -//static Registrar< BiCGSTABModule, -// HMC_SolverModuleFactory > __CGWFmodXMLInit("BiCGSTAB"); -//static Registrar< ConjugateResidualModule, -// HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); + +static Registrar< BiCGSTABModule, + HMC_SolverModuleFactory > __BiCGWFmodXMLInit("BiCGSTAB"); +static Registrar< ConjugateResidualModule, + HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); // add the staggered, scalar versions here diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h index f4d00c72..0ff7fc25 100644 --- a/Grid/qcd/smearing/GaugeConfiguration.h +++ b/Grid/qcd/smearing/GaugeConfiguration.h @@ -49,7 +49,7 @@ public: private: const unsigned int smearingLevels; - Smear_Stout StoutSmearing; + Smear_Stout *StoutSmearing; std::vector SmearedSet; // Member functions @@ -72,7 +72,7 @@ private: previous_u = *ThinLinks; for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) { - StoutSmearing.smear(SmearedSet[smearLvl], previous_u); + StoutSmearing->smear(SmearedSet[smearLvl], previous_u); previous_u = SmearedSet[smearLvl]; // For debug purposes @@ -93,7 +93,7 @@ private: GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid); - StoutSmearing.BaseSmear(C, GaugeK); + StoutSmearing->BaseSmear(C, GaugeK); SigmaK = Zero(); iLambda = Zero(); @@ -107,7 +107,7 @@ private: pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu); } - StoutSmearing.derivative(SigmaK, iLambda, + StoutSmearing->derivative(SigmaK, iLambda, GaugeK); // derivative of SmearBase return SigmaK; } @@ -144,14 +144,14 @@ private: // Exponential iQ2 = iQ * iQ; iQ3 = iQ * iQ2; - StoutSmearing.set_uw(u, w, iQ2, iQ3); - StoutSmearing.set_fj(f0, f1, f2, u, w); + StoutSmearing->set_uw(u, w, iQ2, iQ3); + StoutSmearing->set_fj(f0, f1, f2, u, w); e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2; // Getting B1, B2, Gamma and Lambda // simplify this part, reduntant calculations in set_fj - xi0 = StoutSmearing.func_xi0(w); - xi1 = StoutSmearing.func_xi1(w); + xi0 = StoutSmearing->func_xi0(w); + xi1 = StoutSmearing->func_xi1(w); u2 = u * u; w2 = w * w; cosw = cos(w); @@ -219,7 +219,7 @@ public: /* Standard constructor */ SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, Smear_Stout& Stout) - : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) + : smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL) { for (unsigned int i = 0; i < smearingLevels; ++i) SmearedSet.push_back(*(new GaugeField(UGrid))); @@ -227,7 +227,7 @@ public: /*! 
For just thin links */ SmearedConfiguration() - : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} + : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {} // attach the smeared routines to the thin links U and fill the smeared set void set_Field(GaugeField &U) diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index c7c7d329..b63d8571 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -185,13 +185,14 @@ void A2Autils::MesonField(TensorType &mat, for(int i=0;i::MesonField(TensorType &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::PionFieldXX(Eigen::Tensor &mat, for(int i=0;i::PionFieldXX(Eigen::Tensor &mat, } for(int j=0;j::PionFieldWVmom(Eigen::Tensor &mat, for(int i=0;i::PionFieldWVmom(Eigen::Tensor &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, for(int i=0;i::AslashField(TensorType &mat, for ( int m=0;m::ContractWWVV(std::vector &WWVV, for(int d_o=0;d_o::ContractWWVV(std::vector &WWVV, thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o::OuterProductWWVV(PropagatorField &WWVV, const vobj &rhs, const int Ns, const int ss) { - auto WWVV_v = WWVV.View(); + autoView(WWVV_v,WWVV,CpuWrite); for (int s1 = 0; s1 < Ns; s1++){ for (int s2 = 0; s2 < Ns; s2++){ WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0); @@ -1122,10 +1121,10 @@ void A2Autils::ContractFourQuarkColourDiagonal(const PropagatorField &WWV GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(); - auto WWVV1_v = WWVV1.View(); - auto O_trtr_v= O_trtr.View(); - auto O_fig8_v= O_fig8.View(); + autoView(WWVV0_v , WWVV0,CpuRead); + autoView(WWVV1_v , WWVV1,CpuRead); + autoView(O_trtr_v, O_trtr,CpuWrite); + autoView(O_fig8_v, O_fig8,CpuWrite); thread_for(ss,grid->oSites(),{ typedef typename ComplexField::vector_object vobj; @@ -1166,10 +1165,10 @@ void A2Autils::ContractFourQuarkColourMix(const PropagatorField &WWVV0, GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(); - auto WWVV1_v = WWVV1.View(); - auto O_trtr_v= O_trtr.View(); - auto O_fig8_v= O_fig8.View(); + autoView( WWVV0_v , WWVV0,CpuRead); + autoView( WWVV1_v , WWVV1,CpuRead); + autoView( O_trtr_v, O_trtr,CpuWrite); + autoView( O_fig8_v, O_fig8,CpuWrite); thread_for(ss,grid->oSites(),{ diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index fa2f3376..b268b684 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -7,6 +7,7 @@ Copyright (C) 2019 Author: Felix Erben + Author: Raoul Hodgson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -46,7 +47,7 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - static const Complex epsilon_sgn[6]; + static const Real epsilon_sgn[6]; private: template @@ -58,9 +59,12 @@ public: const Gamma GammaA_right, const Gamma GammaB_right, const int parity, - const int * wick_contractions, + const bool * wick_contractions, robj &result); public: + static void Wick_Contractions(std::string qi, + std::string qf, + bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, @@ -68,8 +72,7 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, ComplexField 
&baryon_corr); template @@ -80,10 +83,59 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, + const int nt, robj &result); + private: + template + static void Baryon_Gamma_3pt_Group1_Site( + const mobj &Dq1_ti, + const mobj2 &Dq2_spec, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + + template + static void Baryon_Gamma_3pt_Group2_Site( + const mobj2 &Dq1_spec, + const mobj &Dq2_ti, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + + template + static void Baryon_Gamma_3pt_Group3_Site( + const mobj2 &Dq1_spec, + const mobj2 &Dq2_spec, + const mobj &Dq3_ti, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + public: + template + static void Baryon_Gamma_3pt( + const PropagatorField &q_ti, + const mobj &Dq_spec1, + const mobj &Dq_spec2, + const PropagatorField &q_tf, + int group, + int wick_contraction, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + SpinMatrixField &stn_corr); private: template static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, @@ -151,119 +203,152 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -template +/*template const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), Complex(1), Complex(1), Complex(-1), Complex(-1), Complex(-1)}; +*/ +template +const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; //This is the old version template template void BaryonUtils::baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const int parity, + const bool * wick_contraction, + robj &result) { - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + + auto D1_GAi = D1 * GammaA_i; + auto D1_GAi_g4 = D1_GAi * g4; + auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); + auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; + auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P; - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (double)parity * gD1b); - auto gD3 = GammaB_right * D3; + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; - auto D2g = D2 * GammaB_left; - auto pD1g = pD1 * GammaB_left; - auto gD3g = gD3 * GammaB_left; + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + for (int ie_f=0; 
ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - for (int gamma_left=0; gamma_left +void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { + const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; + for (int ie=0; ie < 6 ; ie++) { + wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 + && qi[0] == qf[epsilon[ie][0]] + && qi[1] == qf[epsilon[ie][1]] + && qi[2] == qf[epsilon[ie][2]]); } } +/* The array wick_contractions must be of length 6. The order * + * corresponds to the to that shown in the Hadrons documentation * + * at https://aportelli.github.io/Hadrons-doc/#/mcontraction * + * This can be computed from the quark flavours using the * + * Wick_Contractions function above */ template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -272,8 +357,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, ComplexField &baryon_corr) { @@ -281,38 +365,53 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); GridBase *grid = q1_left.Grid(); + + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); - - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - //for(int ss=0; ss < grid->oSites(); ss++){ + Real bytes =0.; + bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); + for (int ie=0; ie < 6 ; ie++){ + if(ie==0 or ie==3){ + bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + } + else{ + bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + } + } + Real t=0.; + t =-usecond(); + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto D1 = v1[ss]; auto D2 = v2[ss]; auto D3 = v3[ss]; - vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites + + t += usecond(); + + std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + } + +/* The array wick_contractions must be of length 6. The order * + * corresponds to the to that shown in the Hadrons documentation * + * at https://aportelli.github.io/Hadrons-doc/#/mcontraction * + * This can also be computed from the quark flavours using the * + * Wick_Contractions function above */ template template void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, @@ -322,34 +421,363 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, + const int nt, robj &result) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - - result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + for (int t=0; t +template +void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( + const mobj &Dq1_ti, + const mobj2 &Dq2_spec, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; + auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + auto Gf_D3 = GammaBf * Dq3_spec; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( + const mobj2 &Dq1_spec, + const mobj &Dq2_ti, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; + auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; + auto Gf_D1 = GammaBf * Dq1_spec; + auto Gf_D3 = GammaBf * Dq3_spec; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( + const mobj2 &Dq1_spec, + const mobj2 &Dq2_spec, + const mobj &Dq3_ti, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; + auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; + auto Gf_D1 = GammaBf * Dq1_spec; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt( + const PropagatorField &q_ti, + const mobj &Dq_spec1, + const mobj &Dq_spec2, + const PropagatorField &q_tf, + int group, + int wick_contraction, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + SpinMatrixField &stn_corr) +{ + GridBase *grid = q_tf.Grid(); + + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_ti , q_ti, CpuRead); + autoView( vq_tf , q_tf, CpuRead); + + if (group == 1) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + 
Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + vcorr[ss] += result; + });//end loop over lattice sites + } else if (group == 2) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + vcorr[ss] += result; + });//end loop over lattice sites + } else if (group == 3) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + + vcorr[ss] += result; + });//end loop over lattice sites + } +} + + +/*********************************************************************** + * End of BaryonGamma3pt-function code. * + * * * The following code is for Sigma -> N rare hypeon decays * **********************************************************************/ @@ -590,13 +1018,12 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(); - auto vq_loop = qq_loop.View(); - auto vd_tf = qd_tf.View(); - auto vs_ti = qs_ti.View(); + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_loop , qq_loop, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_loop = vq_loop[ss]; auto Dd_tf = vd_tf[ss]; auto Ds_ti = vs_ti[ss]; @@ -631,12 +1058,11 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(); - auto vq_ti = qq_ti.View(); - auto vq_tf = qq_tf.View(); - auto vd_tf = qd_tf.View(); - auto vs_ti = qs_ti.View(); - + autoView( vcorr , stn_corr, CpuWrite); + autoView( vq_ti , qq_ti, CpuRead); + autoView( vq_tf , qq_tf, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ auto Dq_ti = vq_ti[ss]; diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 56f8f164..1e016e4e 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -47,8 +47,8 @@ void axpibg5x(Lattice &z,const Lattice &x,Coeff a,Coeff b) GridBase *grid=x.Grid(); Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto z_v = z.View(); + autoView(x_v, x, AcceleratorRead); + autoView(z_v, z, AcceleratorWrite); accelerator_for( ss, x_v.size(),vobj::Nsimd(), { auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); coalescedWrite(z_v[ss],tmp); @@ -63,9 +63,9 @@ void axpby_ssp(Lattice &z, Coeff a,const Lattice &x,Coeff b,const La conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); // FIXME -- need a new class of accelerator_loop to implement this // uint64_t nloop = grid->oSites()/Ls; @@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, 
x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice &z,Coeff a,const Lattice &x,Coeff b,con conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -189,8 +189,8 @@ void G5R5(Lattice &z,const Lattice &x) conformable(x,z); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto z_v = z.View(); + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -222,8 +222,8 @@ void G5C(Lattice> &z, const LatticeoSites(),CComplex::Nsimd(), { for(int n = 0; n < nb; ++n) { diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 7ad80d00..0cc0cc1a 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -222,11 +222,11 @@ public: conformable(subgroup, Determinant); int i0, i1; su2SubGroupIndex(i0, i1, su2_index); - auto subgroup_v = subgroup.View(); - auto source_v = source.View(); - auto Determinant_v = Determinant.View(); - thread_for(ss, grid->oSites(), { + autoView( subgroup_v , subgroup,AcceleratorWrite); + autoView( source_v , source,AcceleratorRead); + autoView( Determinant_v , Determinant,AcceleratorWrite); + accelerator_for(ss, grid->oSites(), 1, { subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0); subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1); @@ -257,15 +257,16 @@ public: su2SubGroupIndex(i0, i1, su2_index); dest = 1.0; // start out with identity - auto dest_v = dest.View(); - auto subgroup_v = subgroup.View(); - thread_for(ss, grid->oSites(), + autoView( dest_v , dest, AcceleratorWrite); + autoView( subgroup_v, subgroup, AcceleratorRead); + accelerator_for(ss, grid->oSites(),1, { dest_v[ss]()()(i0, i0) = 
subgroup_v[ss]()()(0, 0); dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1); dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0); dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1); }); + } /////////////////////////////////////////////// @@ -608,8 +609,8 @@ public: // reunitarise?? template - static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, - double scale = 1.0) { + static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0) + { GridBase *grid = out.Grid(); typedef typename LatticeMatrixType::vector_type vector_type; @@ -618,8 +619,7 @@ public: typedef iSinglet vTComplexType; typedef Lattice LatticeComplexType; - typedef typename GridTypeMapper< - typename LatticeMatrixType::vector_object>::scalar_object MatrixType; + typedef typename GridTypeMapper::scalar_object MatrixType; LatticeComplexType ca(grid); LatticeMatrixType lie(grid); @@ -629,6 +629,7 @@ public: MatrixType ta; lie = Zero(); + for (int a = 0; a < AdjointDimension; a++) { random(pRNG, ca); @@ -640,6 +641,7 @@ public: la = ci * ca * ta; lie = lie + la; // e^{i la ta} + } taExp(lie, out); } diff --git a/Grid/serialisation/Serialisation.h b/Grid/serialisation/Serialisation.h index c95226b1..e14120af 100644 --- a/Grid/serialisation/Serialisation.h +++ b/Grid/serialisation/Serialisation.h @@ -36,7 +36,7 @@ Author: Peter Boyle #include "BinaryIO.h" #include "TextIO.h" #include "XmlIO.h" -#ifndef GRID_NVCC +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) #include "JSON_IO.h" #endif diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h new file mode 100644 index 00000000..76c556d7 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -0,0 +1,779 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_asm_double.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } +// DECLARATIONS +#define DECLARATIONS_A64FXd \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "fmov z31.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// RESULT +#define RESULT_A64FXd(base) \ +{ \ +asm ( \ + "str z0, [%[storeptr], -6, mul vl] \n\t" \ + "str z1, [%[storeptr], -5, mul vl] \n\t" \ + "str z2, [%[storeptr], -4, mul vl] \n\t" \ + "str z3, [%[storeptr], -3, mul vl] \n\t" \ + "str z4, [%[storeptr], -2, mul vl] \n\t" \ + "str z5, [%[storeptr], -1, mul vl] \n\t" \ + "str z6, [%[storeptr], 0, mul vl] \n\t" \ + "str z7, [%[storeptr], 1, mul vl] \n\t" \ + "str z8, [%[storeptr], 2, mul vl] \n\t" \ + "str z9, [%[storeptr], 3, mul vl] \n\t" \ + "str z10, [%[storeptr], 4, mul vl] \n\t" \ + "str z11, [%[storeptr], 5, mul vl] \n\t" \ + : \ + : [storeptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L2 
(prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXd(base) \ +{ \ +asm ( \ + "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr 
z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : 
[tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERMUTE +#define PERMUTE_A64FXd \ +asm ( \ + "tbl z12.d, { z12.d }, z30.d \n\t" \ + "tbl z13.d, { z13.d }, z30.d \n\t" \ + "tbl z14.d, { z14.d }, z30.d \n\t" \ + "tbl z15.d, { z15.d }, z30.d \n\t" \ + "tbl z16.d, { z16.d }, z30.d \n\t" \ + "tbl z17.d, { z17.d }, z30.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN +#define MULT_2SPIN_1_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "movprfx z18.d, p5/m, z31.d \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ + "movprfx z21.d, p5/m, z31.d \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ + "movprfx z19.d, p5/m, z31.d \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ + "movprfx z22.d, p5/m, z31.d \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ + "movprfx z20.d, p5/m, z31.d \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ + "movprfx z23.d, p5/m, z31.d \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ + "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ +asm ( \ + "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ + 
"fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ + "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ + "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ + "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ + "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ + "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ + "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ +asm ( \ + "movprfx z6.d, p5/m, z31.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "movprfx z7.d, p5/m, z31.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "movprfx z8.d, p5/m, z31.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "movprfx z9.d, p5/m, z31.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "movprfx z10.d, p5/m, z31.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "movprfx z11.d, p5/m, z31.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ +asm ( \ + "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ +asm ( \ + "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ + "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ + "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ + "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ +asm ( \ + "movprfx z6.d, p5/m, z31.d \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "movprfx z7.d, p5/m, z31.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "movprfx z8.d, p5/m, z31.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "movprfx z9.d, p5/m, z31.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "movprfx z10.d, p5/m, z31.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "movprfx z11.d, p5/m, z31.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "mov z0.d, p5/m, z18.d \n\t" \ + "mov z1.d, p5/m, z19.d \n\t" \ + "mov z2.d, p5/m, z20.d \n\t" \ + "mov z3.d, p5/m, z21.d \n\t" \ + "mov z4.d, p5/m, z22.d \n\t" \ + "mov z5.d, p5/m, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ +asm ( \ + "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ + "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ + "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ +asm ( \ + "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ + "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ + "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ + "fcadd z15.d, p5/m, z15.d, 
z21.d, 90 \n\t" \ + "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ + "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ +asm ( \ + "ptrue p5.d \n\t" \ + "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ + "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ + "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ + "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ + "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ + "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ + "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ + "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ + "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ + "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ + "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ + 
"fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ +asm ( \ + "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ + "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ + "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ + "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ + "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ + "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ + "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ +asm ( \ + "ptrue p5.d \n\t" \ + "fmov z0.d , 0 \n\t" \ + "fmov z1.d , 0 \n\t" \ + "fmov z2.d , 0 \n\t" \ + "fmov z3.d , 0 \n\t" \ + "fmov z4.d , 0 \n\t" \ + "fmov z5.d , 0 \n\t" \ + "fmov z6.d , 0 \n\t" \ + "fmov z7.d , 0 \n\t" \ + "fmov z8.d , 0 \n\t" \ + "fmov z9.d , 0 \n\t" \ + "fmov z10.d , 0 \n\t" \ + "fmov z11.d , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 4, 
mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ +{ \ +asm ( \ + "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXd \ +asm ( \ + "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ + "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ + "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ + "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ + "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ + "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ + "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ + "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ + "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ + "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ + "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ + "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h new file mode 100644 index 00000000..d809f83b --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -0,0 +1,779 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_asm_single.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXf +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf +#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf +#define XP_RECON XP_RECON_A64FXf +#define XM_RECON XM_RECON_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } +// DECLARATIONS +#define DECLARATIONS_A64FXf \ + const uint32_t lut[4][16] = { \ + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ + {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ +asm ( \ + "fmov z31.s , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// RESULT +#define RESULT_A64FXf(base) \ +{ \ +asm ( \ + "str z0, [%[storeptr], -6, mul vl] \n\t" \ + "str z1, [%[storeptr], -5, mul vl] \n\t" \ + "str z2, [%[storeptr], -4, mul vl] \n\t" \ + "str z3, [%[storeptr], -3, mul vl] \n\t" \ + "str z4, [%[storeptr], -2, mul vl] \n\t" \ + "str z5, [%[storeptr], -1, mul vl] \n\t" \ + "str z6, [%[storeptr], 0, mul vl] \n\t" \ + "str z7, [%[storeptr], 1, mul vl] \n\t" \ + "str z8, [%[storeptr], 2, mul vl] \n\t" \ + "str z9, [%[storeptr], 3, mul vl] \n\t" \ + "str z10, [%[storeptr], 4, mul vl] \n\t" \ + "str z11, [%[storeptr], 5, mul vl] \n\t" \ + : \ + : [storeptr] "r" (base + 2 * 3 * 64) \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ +asm ( \ + "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ + "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXf(base) \ +{ \ +asm ( \ + "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + 
"ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ + "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ + "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ + "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (&ref[2][0]) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (0) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (1) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (2) \ + : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ +asm ( \ + "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ + : \ + : [tableptr] "r" (&lut[0]),[index] "i" (3) \ + : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PERMUTE +#define PERMUTE_A64FXf \ +asm ( \ + "tbl z12.s, { z12.s }, z30.s \n\t" \ + "tbl z13.s, { z13.s }, z30.s \n\t" \ + "tbl z14.s, { z14.s }, z30.s \n\t" \ + "tbl z15.s, { z15.s }, z30.s \n\t" \ + "tbl z16.s, { z16.s }, z30.s \n\t" \ + "tbl z17.s, { z17.s }, z30.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN +#define MULT_2SPIN_1_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +asm ( \ + "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ + "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ + "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ + "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "movprfx z18.s, p5/m, z31.s \n\t" \ + "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ + "movprfx z21.s, p5/m, z31.s \n\t" \ + "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ + "movprfx z19.s, p5/m, z31.s \n\t" \ + "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ + "movprfx z22.s, p5/m, z31.s \n\t" \ + "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ + "movprfx z20.s, p5/m, z31.s \n\t" \ + "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ + "movprfx z23.s, p5/m, z31.s \n\t" \ + "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ + "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ + "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ + "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (baseU + 2 * 3 * 64) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ +asm ( \ + "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ + "fcmla 
z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ + "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ + "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ + "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ + "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ + "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ + "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ +asm ( \ + "movprfx z6.s, p5/m, z31.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "movprfx z7.s, p5/m, z31.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "movprfx z8.s, p5/m, z31.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "movprfx z9.s, p5/m, z31.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "movprfx z10.s, p5/m, z31.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "movprfx z11.s, p5/m, z31.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ +asm ( 
\ + "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ +asm ( \ + "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ + "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ + "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ + "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ +asm ( \ + "movprfx z6.s, p5/m, z31.s \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "movprfx z7.s, p5/m, z31.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "movprfx z8.s, p5/m, z31.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "movprfx z9.s, p5/m, z31.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "movprfx z10.s, p5/m, z31.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "movprfx z11.s, p5/m, z31.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ + "mov z0.s, p5/m, z18.s \n\t" \ + "mov z1.s, p5/m, z19.s \n\t" \ + "mov z2.s, p5/m, z20.s \n\t" \ + "mov z3.s, p5/m, z21.s \n\t" \ + "mov z4.s, p5/m, z22.s \n\t" \ + "mov z5.s, p5/m, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ +asm ( \ + "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ + "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ + "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ +asm ( \ + "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ + "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ + "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ + "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ + "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ + "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ +asm ( \ + "ptrue p5.s \n\t" \ + "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ + "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ + "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ + "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ + "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ + "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ + "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ + "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ + "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ + "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ + "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ + : \ + : \ + : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ +asm ( \ + "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ + "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ + "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ + "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ + "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ + "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ + "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ +asm ( \ + "ptrue p5.s \n\t" \ + "fmov z0.s , 0 \n\t" \ + "fmov z1.s , 0 \n\t" \ + "fmov z2.s , 0 \n\t" \ + "fmov z3.s , 0 \n\t" \ + "fmov z4.s , 0 \n\t" \ + "fmov z5.s , 0 \n\t" \ + "fmov z6.s , 0 \n\t" 
\ + "fmov z7.s , 0 \n\t" \ + "fmov z8.s , 0 \n\t" \ + "fmov z9.s , 0 \n\t" \ + "fmov z10.s , 0 \n\t" \ + "fmov z11.s , 0 \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ +{ \ +asm ( \ + "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ + "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + : \ + : [fetchptr] "r" (base) \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ +); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXf \ +asm ( \ + "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ + "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ + "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ + "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ + "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ + "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ + "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ + "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ + "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ + "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ + "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ + "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h new file mode 100644 index 00000000..232610f2 --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -0,0 +1,601 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_intrin_double.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXd +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd +#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) +#define XP_PROJ XP_PROJ_A64FXd +#define YP_PROJ YP_PROJ_A64FXd +#define ZP_PROJ ZP_PROJ_A64FXd +#define TP_PROJ TP_PROJ_A64FXd +#define XM_PROJ XM_PROJ_A64FXd +#define YM_PROJ YM_PROJ_A64FXd +#define ZM_PROJ ZM_PROJ_A64FXd +#define TM_PROJ TM_PROJ_A64FXd +#define XP_RECON XP_RECON_A64FXd +#define XM_RECON XM_RECON_A64FXd +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXd; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } +#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } +// DECLARATIONS +#define DECLARATIONS_A64FXd \ + const uint64_t lut[4][8] = { \ + {4, 5, 6, 7, 0, 1, 2, 3}, \ + {2, 3, 0, 1, 6, 7, 4, 5}, \ + {1, 0, 3, 2, 5, 4, 7, 6}, \ + {0, 1, 2, 4, 5, 6, 7, 8} };\ + svfloat64_t result_00; \ + svfloat64_t result_01; \ + svfloat64_t result_02; \ + svfloat64_t result_10; \ + svfloat64_t result_11; \ + svfloat64_t result_12; \ + svfloat64_t result_20; \ + svfloat64_t result_21; \ + svfloat64_t result_22; \ + svfloat64_t result_30; \ + svfloat64_t result_31; \ + svfloat64_t result_32; \ + svfloat64_t Chi_00; \ + svfloat64_t Chi_01; \ + svfloat64_t Chi_02; \ + svfloat64_t Chi_10; \ + svfloat64_t Chi_11; \ + svfloat64_t Chi_12; \ + svfloat64_t UChi_00; \ + svfloat64_t UChi_01; \ + svfloat64_t UChi_02; \ + svfloat64_t UChi_10; \ + svfloat64_t UChi_11; \ + svfloat64_t UChi_12; \ + svfloat64_t U_00; \ + svfloat64_t U_10; \ + svfloat64_t U_20; \ + svfloat64_t U_01; \ + svfloat64_t U_11; \ + svfloat64_t U_21; \ + svbool_t pg1; \ + pg1 = svptrue_b64(); \ + svuint64_t table0; \ + svfloat64_t zero0; \ + zero0 = svdup_f64(0.); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define 
Chimu_31 UChi_11 +#define Chimu_32 UChi_12 +// RESULT +#define RESULT_A64FXd(base) \ +{ \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ + svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXd(base) \ +{ \ + Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ + Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ + Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ + Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ + Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ + Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ +{ \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, 
(float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXd \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ + table0 = svld1(pg1, (uint64_t*)&lut[0]); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ + table0 = svld1(pg1, (uint64_t*)&lut[1]); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ + table0 = svld1(pg1, (uint64_t*)&lut[2]); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ + table0 = svld1(pg1, (uint64_t*)&lut[3]); + +// PERMUTE +#define PERMUTE_A64FXd \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ +} +// MULT_2SPIN +#define MULT_2SPIN_1_A64FXd(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = 
svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXd \ +{ \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXd \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ 
+ result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXd \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXd \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXd \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXd \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXd \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXd \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXd \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = 
svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXd \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, 
result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXd \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXd \ + result_00 = svdup_f64(0.); \ + result_01 = svdup_f64(0.); \ + result_02 = svdup_f64(0.); \ + result_10 = svdup_f64(0.); \ + result_11 = svdup_f64(0.); \ + result_12 = svdup_f64(0.); \ + result_20 = svdup_f64(0.); \ + result_21 = svdup_f64(0.); \ + result_22 = svdup_f64(0.); \ + result_30 = svdup_f64(0.); \ + result_31 = svdup_f64(0.); \ + result_32 = svdup_f64(0.); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXd \ + result_00 = svadd_x(pg1, result_00, Chimu_00); \ + result_01 = svadd_x(pg1, result_01, Chimu_01); \ + result_02 = svadd_x(pg1, result_02, Chimu_02); \ + result_10 = svadd_x(pg1, result_10, Chimu_10); \ + result_11 = svadd_x(pg1, result_11, Chimu_11); \ + result_12 = svadd_x(pg1, result_12, Chimu_12); \ + result_20 = svadd_x(pg1, result_20, Chimu_20); \ + result_21 = svadd_x(pg1, result_21, Chimu_21); \ + result_22 = svadd_x(pg1, result_22, Chimu_22); \ + result_30 = svadd_x(pg1, result_30, Chimu_30); \ + result_31 = svadd_x(pg1, result_31, Chimu_31); \ + result_32 = svadd_x(pg1, result_32, Chimu_32); + diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h new file mode 100644 index 00000000..180e5f4f --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -0,0 +1,601 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_intrin_single.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) +#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) +#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) +#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) +#define PF_GAUGE(A) +#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) +#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) +#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) +#define LOCK_GAUGE(A) +#define UNLOCK_GAUGE(A) +#define MASK_REGS DECLARATIONS_A64FXf +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) +#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf +#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) +#define XP_PROJ XP_PROJ_A64FXf +#define YP_PROJ YP_PROJ_A64FXf +#define ZP_PROJ ZP_PROJ_A64FXf +#define TP_PROJ TP_PROJ_A64FXf +#define XM_PROJ XM_PROJ_A64FXf +#define YM_PROJ YM_PROJ_A64FXf +#define ZM_PROJ ZM_PROJ_A64FXf +#define TM_PROJ TM_PROJ_A64FXf +#define XP_RECON XP_RECON_A64FXf +#define XM_RECON XM_RECON_A64FXf +#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf +#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf +#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf +#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf +#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf +#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf +#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf +#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf +#define PERMUTE_DIR0 0 +#define PERMUTE_DIR1 1 +#define PERMUTE_DIR2 2 +#define PERMUTE_DIR3 3 +#define PERMUTE PERMUTE_A64FXf; +#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } +#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } +// DECLARATIONS +#define DECLARATIONS_A64FXf \ + const uint32_t lut[4][16] = { \ + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ + {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ + {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ + svfloat32_t result_00; \ + svfloat32_t result_01; \ + svfloat32_t result_02; \ + svfloat32_t result_10; \ + svfloat32_t result_11; \ + svfloat32_t result_12; \ + svfloat32_t result_20; \ + svfloat32_t result_21; \ + svfloat32_t result_22; \ + svfloat32_t result_30; \ + svfloat32_t result_31; \ + svfloat32_t result_32; \ + svfloat32_t Chi_00; \ + svfloat32_t Chi_01; \ + svfloat32_t Chi_02; \ + svfloat32_t Chi_10; \ + svfloat32_t Chi_11; \ + svfloat32_t Chi_12; \ + svfloat32_t UChi_00; \ + svfloat32_t UChi_01; \ + svfloat32_t UChi_02; \ + svfloat32_t UChi_10; \ + svfloat32_t UChi_11; \ + svfloat32_t UChi_12; \ + svfloat32_t U_00; \ + svfloat32_t U_10; \ + svfloat32_t U_20; \ + svfloat32_t U_01; \ + svfloat32_t U_11; \ + svfloat32_t U_21; \ + svbool_t pg1; \ + pg1 = 
svptrue_b32(); \ + svuint32_t table0; \ + svfloat32_t zero0; \ + zero0 = svdup_f32(0.); + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 +// RESULT +#define RESULT_A64FXf(base) \ +{ \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ + svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ +} +// PREFETCH_CHIMU_L2 (prefetch to L2) +#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ +} +// PREFETCH_CHIMU_L1 (prefetch to L1) +#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ +} +// PREFETCH_GAUGE_L2 (prefetch to L2) +#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ + svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ +} +// PREFETCH_GAUGE_L1 (prefetch to L1) +#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ + svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ +} +// LOAD_CHI +#define LOAD_CHI_A64FXf(base) \ +{ \ + Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ + Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ + Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ + Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ + Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ + Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ +} +// LOAD_CHIMU +#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ +{ \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = 
svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_CHIMU_0213 +#define LOAD_CHIMU_0213_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ +} +// LOAD_CHIMU_0312 +#define LOAD_CHIMU_0312_A64FXf \ +{ \ + const SiteSpinor & ref(in[offset]); \ + Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ + Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ + Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ + Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ + Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ + Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ + Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ + Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ + Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ + Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ + Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ +} +// LOAD_TABLE0 +#define LOAD_TABLE0 \ + table0 = svld1(pg1, (uint32_t*)&lut[0]); + +// LOAD_TABLE1 +#define LOAD_TABLE1 \ + table0 = svld1(pg1, (uint32_t*)&lut[1]); + +// LOAD_TABLE2 +#define LOAD_TABLE2 \ + table0 = svld1(pg1, (uint32_t*)&lut[2]); + +// LOAD_TABLE3 +#define LOAD_TABLE3 \ + table0 = svld1(pg1, (uint32_t*)&lut[3]); + +// PERMUTE +#define PERMUTE_A64FXf \ + Chi_00 = svtbl(Chi_00, table0); \ + Chi_01 = svtbl(Chi_01, table0); \ + Chi_02 = svtbl(Chi_02, table0); \ + Chi_10 = svtbl(Chi_10, table0); \ + Chi_11 = svtbl(Chi_11, table0); \ + Chi_12 = svtbl(Chi_12, table0); + +// LOAD_GAUGE +#define LOAD_GAUGE \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +{ \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ +} +// MULT_2SPIN +#define 
MULT_2SPIN_1_A64FXf(A) \ +{ \ + const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ + U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ + U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ + U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ + UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ + UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ + UChi_11 = svcmla_x(pg1, zero0, U_10, Chi_10, 0); \ + UChi_02 = svcmla_x(pg1, zero0, U_20, Chi_00, 0); \ + UChi_12 = svcmla_x(pg1, zero0, U_20, Chi_10, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_00, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_10, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_00, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ + U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ + U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ + U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ +} +// MULT_2SPIN_BACKEND +#define MULT_2SPIN_2_A64FXf \ +{ \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_01, Chi_01, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_01, Chi_11, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_11, Chi_01, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_11, Chi_11, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_21, Chi_01, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_21, Chi_11, 90); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 0); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 0); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 0); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 0); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 0); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 0); \ + UChi_00 = svcmla_x(pg1, UChi_00, U_00, Chi_02, 90); \ + UChi_10 = svcmla_x(pg1, UChi_10, U_00, Chi_12, 90); \ + UChi_01 = svcmla_x(pg1, UChi_01, U_10, Chi_02, 90); \ + UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_12, 90); \ + UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_02, 90); \ + UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_12, 90); \ +} +// XP_PROJ +#define XP_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 90); \ +} +// XP_RECON +#define XP_RECON_A64FXf \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 270); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 270); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 270); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 270); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 270); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 270); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; 
\ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// XP_RECON_ACCUM +#define XP_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_PROJ +#define YP_PROJ_A64FXf \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_22); \ +} +// ZP_PROJ +#define ZP_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 90); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 90); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 90); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 270); \ +} +// TP_PROJ +#define TP_PROJ_A64FXf \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svadd_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = svadd_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svadd_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_PROJ +#define XM_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_30, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_31, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_32, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_20, 270); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_21, 270); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_22, 270); \ +} +// XM_RECON +#define XM_RECON_A64FXf \ + result_20 = svcadd_x(pg1, zero0, UChi_10, 90); \ + result_21 = svcadd_x(pg1, zero0, UChi_11, 90); \ + result_22 = svcadd_x(pg1, zero0, UChi_12, 90); \ + result_30 = svcadd_x(pg1, zero0, UChi_00, 90); \ + result_31 = svcadd_x(pg1, zero0, UChi_01, 90); \ + result_32 = svcadd_x(pg1, zero0, UChi_02, 90); \ + result_00 = UChi_00; \ + result_01 = UChi_01; \ + result_02 = UChi_02; \ + result_10 = UChi_10; \ + result_11 = UChi_11; \ + result_12 = UChi_12; + +// YM_PROJ +#define YM_PROJ_A64FXf \ +{ \ + Chi_00 = svadd_x(pg1, Chimu_00, Chimu_30); \ + Chi_01 = svadd_x(pg1, Chimu_01, Chimu_31); \ + Chi_02 = svadd_x(pg1, Chimu_02, Chimu_32); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_20); \ + Chi_11 = svsub_x(pg1, Chimu_11, Chimu_21); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_22); \ +} +// ZM_PROJ +#define ZM_PROJ_A64FXf \ +{ \ + Chi_00 = svcadd_x(pg1, Chimu_00, Chimu_20, 270); \ + Chi_01 = svcadd_x(pg1, Chimu_01, Chimu_21, 270); \ + Chi_02 = svcadd_x(pg1, Chimu_02, Chimu_22, 270); \ + Chi_10 = svcadd_x(pg1, Chimu_10, Chimu_30, 90); \ + Chi_11 = svcadd_x(pg1, Chimu_11, Chimu_31, 90); \ + Chi_12 = svcadd_x(pg1, Chimu_12, Chimu_32, 90); \ +} +// TM_PROJ +#define TM_PROJ_A64FXf \ +{ \ + Chi_00 = svsub_x(pg1, Chimu_00, Chimu_20); \ + Chi_01 = svsub_x(pg1, Chimu_01, Chimu_21); \ + Chi_02 = svsub_x(pg1, Chimu_02, Chimu_22); \ + Chi_10 = svsub_x(pg1, Chimu_10, Chimu_30); \ + Chi_11 = 
svsub_x(pg1, Chimu_11, Chimu_31); \ + Chi_12 = svsub_x(pg1, Chimu_12, Chimu_32); \ +} +// XM_RECON_ACCUM +#define XM_RECON_ACCUM_A64FXf \ + result_30 = svcadd_x(pg1, result_30, UChi_00, 90); \ + result_31 = svcadd_x(pg1, result_31, UChi_01, 90); \ + result_32 = svcadd_x(pg1, result_32, UChi_02, 90); \ + result_20 = svcadd_x(pg1, result_20, UChi_10, 90); \ + result_21 = svcadd_x(pg1, result_21, UChi_11, 90); \ + result_22 = svcadd_x(pg1, result_22, UChi_12, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// YP_RECON_ACCUM +#define YP_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svsub_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svsub_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svsub_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svadd_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svadd_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svadd_x(pg1, result_22, UChi_12); + +// YM_RECON_ACCUM +#define YM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_30 = svadd_x(pg1, result_30, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_31 = svadd_x(pg1, result_31, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_32 = svadd_x(pg1, result_32, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_20 = svsub_x(pg1, result_20, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_21 = svsub_x(pg1, result_21, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_22 = svsub_x(pg1, result_22, UChi_12); + +// ZP_RECON_ACCUM +#define ZP_RECON_ACCUM_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 270); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 270); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 270); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 90); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 90); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 90); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// ZM_RECON_ACCUM +#define ZM_RECON_ACCUM_A64FXf \ + result_20 = svcadd_x(pg1, result_20, UChi_00, 90); \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_21 = svcadd_x(pg1, result_21, UChi_01, 90); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_22 = svcadd_x(pg1, result_22, UChi_02, 90); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_30 = svcadd_x(pg1, result_30, UChi_10, 270); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_31 = svcadd_x(pg1, result_31, UChi_11, 270); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_32 = svcadd_x(pg1, result_32, UChi_12, 270); \ + result_12 = svadd_x(pg1, result_12, UChi_12); + +// TP_RECON_ACCUM +#define TP_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + 
result_20 = svadd_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svadd_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svadd_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svadd_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svadd_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svadd_x(pg1, result_32, UChi_12); + +// TM_RECON_ACCUM +#define TM_RECON_ACCUM_A64FXf \ + result_00 = svadd_x(pg1, result_00, UChi_00); \ + result_20 = svsub_x(pg1, result_20, UChi_00); \ + result_01 = svadd_x(pg1, result_01, UChi_01); \ + result_21 = svsub_x(pg1, result_21, UChi_01); \ + result_02 = svadd_x(pg1, result_02, UChi_02); \ + result_22 = svsub_x(pg1, result_22, UChi_02); \ + result_10 = svadd_x(pg1, result_10, UChi_10); \ + result_30 = svsub_x(pg1, result_30, UChi_10); \ + result_11 = svadd_x(pg1, result_11, UChi_11); \ + result_31 = svsub_x(pg1, result_31, UChi_11); \ + result_12 = svadd_x(pg1, result_12, UChi_12); \ + result_32 = svsub_x(pg1, result_32, UChi_12); + +// ZERO_PSI +#define ZERO_PSI_A64FXf \ + result_00 = svdup_f32(0.); \ + result_01 = svdup_f32(0.); \ + result_02 = svdup_f32(0.); \ + result_10 = svdup_f32(0.); \ + result_11 = svdup_f32(0.); \ + result_12 = svdup_f32(0.); \ + result_20 = svdup_f32(0.); \ + result_21 = svdup_f32(0.); \ + result_22 = svdup_f32(0.); \ + result_30 = svdup_f32(0.); \ + result_31 = svdup_f32(0.); \ + result_32 = svdup_f32(0.); + +// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ +} +// PREFETCH_RESULT_L1_STORE (prefetch store to L1) +#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ +{ \ + svprfd(pg1, (int64_t*)(base + 0), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 256), SV_PSTL1STRM); \ + svprfd(pg1, (int64_t*)(base + 512), SV_PSTL1STRM); \ +} +// ADD_RESULT_INTERNAL +#define ADD_RESULT_INTERNAL_A64FXf \ + result_00 = svadd_x(pg1, result_00, Chimu_00); \ + result_01 = svadd_x(pg1, result_01, Chimu_01); \ + result_02 = svadd_x(pg1, result_02, Chimu_02); \ + result_10 = svadd_x(pg1, result_10, Chimu_10); \ + result_11 = svadd_x(pg1, result_11, Chimu_11); \ + result_12 = svadd_x(pg1, result_12, Chimu_12); \ + result_20 = svadd_x(pg1, result_20, Chimu_20); \ + result_21 = svadd_x(pg1, result_21, Chimu_21); \ + result_22 = svadd_x(pg1, result_22, Chimu_22); \ + result_30 = svadd_x(pg1, result_30, Chimu_30); \ + result_31 = svadd_x(pg1, result_31, Chimu_31); \ + result_32 = svadd_x(pg1, result_32, Chimu_32); + diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h new file mode 100644 index 00000000..81eec37a --- /dev/null +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -0,0 +1,76 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Fujitsu_A64FX_undef.h + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#undef LOAD_CHIMU +#undef PREFETCH_CHIMU_L1 +#undef PREFETCH_GAUGE_L1 +#undef PREFETCH_CHIMU_L2 +#undef PREFETCH_GAUGE_L2 +#undef PREFETCH_GAUGE_L1_INTERNAL +#undef PREFETCH1_CHIMU +#undef PREFETCH_CHIMU +#undef PREFETCH_RESULT_L2_STORE +#undef PREFETCH_RESULT_L1_STORE +#undef LOAD_GAUGE +#undef LOCK_GAUGE +#undef UNLOCK_GAUGE +#undef MASK_REGS +#undef SAVE_RESULT +#undef ADD_RESULT +#undef MULT_2SPIN_1 +#undef MULT_2SPIN_2 +#undef MAYBEPERM +#undef LOAD_CHI +#undef XP_PROJ +#undef YP_PROJ +#undef ZP_PROJ +#undef TP_PROJ +#undef XM_PROJ +#undef YM_PROJ +#undef ZM_PROJ +#undef TM_PROJ +#undef XP_RECON +#undef XM_RECON +#undef XM_RECON_ACCUM +#undef YM_RECON_ACCUM +#undef ZM_RECON_ACCUM +#undef TM_RECON_ACCUM +#undef XP_RECON_ACCUM +#undef YP_RECON_ACCUM +#undef ZP_RECON_ACCUM +#undef TP_RECON_ACCUM +#undef PERMUTE +#undef PERMUTE_DIR0 +#undef PERMUTE_DIR1 +#undef PERMUTE_DIR2 +#undef PERMUTE_DIR3 +#undef LOAD_TABLE +#undef LOAD_TABLE0 +#undef LOAD_TABLE1 +#undef LOAD_TABLE2 +#undef LOAD_TABLE3 diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h new file mode 100644 index 00000000..2ad8591c --- /dev/null +++ b/Grid/simd/Grid_a64fx-2.h @@ -0,0 +1,942 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid_a64fx-2.h + + Copyright (C) 2020 + + Author: Nils Meyer + + with support from Arm + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+
+/////////////////////////////////////////////////////
+// Using SVE ACLE
+/////////////////////////////////////////////////////
+
+static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes");
+
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Optimization);
+
+  // type traits giving the number of elements for each vector type
+  template <typename T> struct W;
+  template <> struct W<double> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
+  };
+  template <> struct W<float> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+  };
+  template <> struct W<Integer> {
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+  };
+  template <> struct W<uint16_t> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/4u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/2u;
+  };
+  template <> struct W<uint64_t> {
+    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
+  };
+
+  #ifdef ARMCLANGCOMPAT
+  // SIMD vector immediate types
+  template <typename T>
+  struct vec_imm {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
+
+  // SIMD vector types
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+    vec() = default;
+    vec(const vec &rhs) { this->operator=(rhs); }
+    vec(const vec_imm<T> &rhs) {
+      // v = rhs.v
+      svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
+    }
+
+    inline vec &operator=(const vec &rhs) {
+      // v = rhs.v
+      svst1(svptrue_b8(), (T*)this, svld1(svptrue_b8(), (T*)rhs.v));
+      return *this;
+    };
+  };
+
+  #else // no ARMCLANGCOMPAT
+  #define vec_imm vec
+  // SIMD vector types
+  template <typename T>
+  struct vec {
+    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
+  #endif
+
+  typedef vec<float>    vecf;
+  typedef vec<double>   vecd;
+  typedef vec<uint16_t> vech; // half precision comms
+  typedef vec<Integer>  veci;
+
+NAMESPACE_END(Optimization)
+NAMESPACE_END(Grid)
+
+// low-level API
+NAMESPACE_BEGIN(Grid);
+NAMESPACE_BEGIN(Optimization);
+
+template <typename T>
+struct acle{};
+
+template <>
+struct acle<double>{
+  typedef svfloat64_t vt;
+  typedef svfloat64x2_t vt2;
+  typedef svfloat64x4_t vt4;
+  typedef float64_t pt;
+  typedef uint64_t uint;
+  typedef svuint64_t svuint;
+
+  static inline svbool_t pg1(){return svptrue_b64();}
+  static inline svbool_t pg2(){return svptrue_pat_b64(SV_VL4);}
+  static inline svbool_t pg4(){return svptrue_pat_b64(SV_VL2);}
+  static inline vec<uint64_t> tbl_swap(){
+    //const vec<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
+    const vec_imm<uint64_t> t = {1, 0, 3, 2, 5, 4, 7, 6};
+    return t;
+  }
+  static inline vec<uint64_t> tbl0(){
+    //const vec<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
+    const vec_imm<uint64_t> t = {4, 5, 6, 7, 0, 1, 2, 3};
+    return t;
+  }
+  static inline vec<uint64_t> tbl1(){
+    //const vec<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
+    const vec_imm<uint64_t> t = {2, 3, 0, 1, 6, 7, 4, 5};
+    return t;
+  }
+  static inline vec<uint64_t> tbl_exch1a(){ // Exchange1
+    //const vec<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
+    const vec_imm<uint64_t> t = {0, 1, 4, 5, 2, 3, 6, 7};
+    return t;
+  }
+  static inline vec<uint64_t> tbl_exch1b(){ // Exchange1
+    //const vec<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
+    const vec_imm<uint64_t> t = {2, 3, 6, 7, 0, 1, 4, 5};
+    return t;
+  }
+  static inline vec<uint64_t> tbl_exch1c(){ // Exchange1
+    //const vec<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
+    const vec_imm<uint64_t> t = {4, 5, 0, 1, 6, 7, 2, 3};
+    return t;
+  }
+  static inline svbool_t pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());}
+  static inline svbool_t pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());}
+  static inline svfloat64_t zero(){return svdup_f64(0.);}
+};
+
+template <>
+struct acle<float>{
+  typedef svfloat32_t vt;
+  typedef svfloat32x2_t vt2;
+  typedef float32_t pt;
+  typedef uint32_t uint;
+  typedef svuint32_t svuint;
+
+  static inline svbool_t pg1(){return svptrue_b32();}
+  static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);}
+  // exchange neighboring elements
+  static inline vec<uint32_t> tbl_swap(){
+    //const vec<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    const vec_imm<uint32_t> t = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    return t;
+  }
+  static inline vec<uint32_t> tbl0(){
+    //const vec<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+    const vec_imm<uint32_t> t = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+    return t;
+  }
+  static inline vec<uint32_t> tbl1(){
+    //const vec<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+    const vec_imm<uint32_t> t = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+    return t;
+  }
+  static inline vec<uint32_t> tbl2(){
+    //const vec<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
+    const vec_imm<uint32_t> t = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
+    return t;
+  }
+  static inline vec<uint32_t> tbl_exch1a(){ // Exchange1
+    //const vec<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
+    const vec_imm<uint32_t> t = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 };
+    return t;
+  }
+  static inline vec<uint32_t> tbl_exch1b(){ // Exchange1
+    //const vec<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
+    const vec_imm<uint32_t> t = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 };
+    return t;
+  }
+  static inline vec<uint32_t> tbl_exch1c(){ // Exchange1
+    //const vec<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
+    const vec_imm<uint32_t> t = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7};
+    return t;
+  }
+  static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
+  static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
+  static inline svfloat32_t zero(){return svdup_f32(0.);}
+};
+
+template <>
+struct acle<uint16_t>{
+  typedef svfloat16_t vt;
+  typedef float16_t pt;
+  typedef uint16_t uint;
+  typedef svuint16_t svuint;
+
+  static inline svbool_t pg1(){return svptrue_b16();}
+  static inline svbool_t pg2(){return svptrue_pat_b16(SV_VL16);}
+  static inline svbool_t pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());}
+  static inline svbool_t pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());}
+  static inline svfloat16_t zero(){return svdup_f16(0.);}
+};
+
+template <>
+struct acle<Integer>{
+  typedef svuint32_t vt;
+  typedef svuint32x2_t vt2;
+  typedef Integer pt;
+  typedef uint32_t uint;
+  typedef svuint32_t svuint;
+
+  //static inline svbool_t pg1(){return svptrue_b16();}
+  static inline svbool_t pg1(){return svptrue_b32();}
+  static inline svbool_t pg2(){return svptrue_pat_b32(SV_VL8);}
+  static inline svbool_t pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());}
+  static inline svbool_t pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());}
+};
+
+// ---------------------------------------------------
+
+struct Vsplat{
+  // Complex float
+  inline vecf operator()(float a, float b){
+    vecf out;
+    svbool_t pg1 = acle<float>::pg1();
+    typename acle<float>::vt a_v = svdup_f32(a);
+    typename acle<float>::vt b_v = svdup_f32(b);
+    typename acle<float>::vt r_v = svzip1(a_v, b_v);
+    svst1(pg1, out.v, r_v);
+    return out;
+  }
+
+  // Real float
+  inline vecf operator()(float a){
+    vecf out;
+    svbool_t pg1 = acle<float>::pg1();
+    typename acle<float>::vt r_v = svdup_f32(a);
+
svst1(pg1, out.v, r_v); + return out; + } + + // Complex double + inline vecd operator()(double a, double b){ + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svdup_f64(a); + typename acle::vt b_v = svdup_f64(b); + typename acle::vt r_v = svzip1(a_v, b_v); + svst1(pg1, out.v, r_v); + return out; + } + + // Real double + inline vecd operator()(double a){ + vecd out; + svbool_t pg1 = acle::pg1(); + typename acle::vt r_v = svdup_f64(a); + svst1(pg1, out.v, r_v); + return out; + } + + // Integer + inline vec operator()(Integer a){ + vec out; + svbool_t pg1 = acle::pg1(); + // Add check whether Integer is really a uint32_t??? + typename acle::vt r_v = svdup_u32(a); + svst1(pg1, out.v, r_v); + return out; + } +}; + +struct Vstore{ + // Real + template + inline void operator()(vec a, T *D){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (typename acle::pt*)&a.v); + svst1(pg1, D, a_v); + } +}; + +struct Vstream{ + // Real + template + inline void operator()(T * a, vec b){ + svbool_t pg1 = acle::pg1(); + typename acle::vt b_v = svld1(pg1, b.v); + svstnt1(pg1, a, b_v); + //svst1(pg1, a, b_v); + } +}; + + struct Vset{ + // Complex + template + inline vec operator()(std::complex *a){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, (T*)a); + svst1(pg1, out.v, a_v); + + return out; + } + + // Real + template + inline vec operator()(T *a){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a); + svst1(pg1, out.v, a_v); + + return out; + } + }; + +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// + +struct Sum{ + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svadd_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Sub{ + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svsub_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Mult{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + typename acle::vt r_v = svmla_x(pg1, c_v, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svmul_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + + // using FCMLA + typename acle::vt z_v = acle::zero(); + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MaddRealPart{ + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v); + + // using FCMLA + typename acle::vt 
r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultComplex{ + // Complex a*b + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt z_v = acle::zero(); + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 0); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); + + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct MultAddComplex{ + // Complex a*b+c + template + inline vec operator()(vec a, vec b, vec c){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt c_v = svld1(pg1, c.v);; + + // using FCMLA + typename acle::vt r_v = svcmla_x(pg1, c_v, a_v, b_v, 0); + r_v = svcmla_x(pg1, r_v, a_v, b_v, 90); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Div{ + // Real + template + inline vec operator()(vec a, vec b){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, a.v); + typename acle::vt b_v = svld1(pg1, b.v); + typename acle::vt r_v = svdiv_x(pg1, a_v, b_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct Conj{ + // Complex + template + inline vec operator()(vec a){ + vec out; + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, a.v); + //typename acle::vt r_v = svneg_x(pg_odd, a_v); + typename acle::vt r_v = svneg_m(a_v, pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct TimesMinusI{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_odd = acle::pg_odd(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + typename acle::vt r_v = svneg_m(a_v, pg_odd, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct TimesI{ + // Complex + template + inline vec operator()(vec a, vec b){ + vec out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt a_v = svld1(pg1, a.v); + a_v = svtbl(a_v, tbl_swap_v); + //typename acle::vt r_v = svneg_x(pg_even, a_v); + typename acle::vt r_v = svneg_m(a_v, pg_even, a_v); + svst1(pg1, out.v, r_v); + + return out; + } +}; + +struct PrecisionChange { + static inline vech StoH (const vecf &sa,const vecf &sb) { + vech ret; + svbool_t pg1s = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt sa_v = svld1(pg1s, sa.v); + typename acle::vt sb_v = svld1(pg1s, sb.v); + typename acle::vt ha_v = svcvt_f16_x(pg1s, sa_v); + typename acle::vt hb_v = svcvt_f16_x(pg1s, sb_v); + typename acle::vt r_v = svuzp1(ha_v, hb_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + svbool_t pg1h = acle::pg1(); + svbool_t pg1s = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt ha_v = svzip1(h_v, h_v); + typename acle::vt hb_v = svzip2(h_v, h_v); + typename acle::vt sa_v = svcvt_f32_x(pg1s, ha_v); + typename acle::vt sb_v = svcvt_f32_x(pg1s, hb_v); + svst1(pg1s, sa.v, sa_v); + svst1(pg1s, sb.v, sb_v); + } + static inline vecf DtoS (vecd a,vecd b) { + vecf ret; + svbool_t pg1d = acle::pg1(); 
+ svbool_t pg1s = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt sa_v = svcvt_f32_x(pg1d, a_v); + typename acle::vt sb_v = svcvt_f32_x(pg1d, b_v); + typename acle::vt r_v = svuzp1(sa_v, sb_v); + svst1(pg1s, ret.v, r_v); + + return ret; + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + svbool_t pg1s = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt s_v = svld1(pg1s, s.v); + typename acle::vt sa_v = svzip1(s_v, s_v); + typename acle::vt sb_v = svzip2(s_v, s_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, sa_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, sb_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + vech ret; + svbool_t pg1d = acle::pg1(); + svbool_t pg1h = acle::pg1(); + typename acle::vt a_v = svld1(pg1d, a.v); + typename acle::vt b_v = svld1(pg1d, b.v); + typename acle::vt c_v = svld1(pg1d, c.v); + typename acle::vt d_v = svld1(pg1d, d.v); + typename acle::vt ha_v = svcvt_f16_x(pg1d, a_v); + typename acle::vt hb_v = svcvt_f16_x(pg1d, b_v); + typename acle::vt hc_v = svcvt_f16_x(pg1d, c_v); + typename acle::vt hd_v = svcvt_f16_x(pg1d, d_v); + typename acle::vt hab_v = svuzp1(ha_v, hb_v); + typename acle::vt hcd_v = svuzp1(hc_v, hd_v); + typename acle::vt r_v = svuzp1(hab_v, hcd_v); + svst1(pg1h, (typename acle::pt*)&ret.v, r_v); + + return ret; +/* + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); +*/ + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + svbool_t pg1h = acle::pg1(); + svbool_t pg1d = acle::pg1(); + typename acle::vt h_v = svld1(pg1h, (typename acle::pt*)&h.v); + typename acle::vt sa_v = svzip1(h_v, h_v); + typename acle::vt sb_v = svzip2(h_v, h_v); + typename acle::vt da_v = svzip1(sa_v, sa_v); + typename acle::vt db_v = svzip2(sa_v, sa_v); + typename acle::vt dc_v = svzip1(sb_v, sb_v); + typename acle::vt dd_v = svzip2(sb_v, sb_v); + typename acle::vt a_v = svcvt_f64_x(pg1d, da_v); + typename acle::vt b_v = svcvt_f64_x(pg1d, db_v); + typename acle::vt c_v = svcvt_f64_x(pg1d, dc_v); + typename acle::vt d_v = svcvt_f64_x(pg1d, dd_v); + svst1(pg1d, a.v, a_v); + svst1(pg1d, b.v, b_v); + svst1(pg1d, c.v, c_v); + svst1(pg1d, d.v, d_v); +/* + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); +*/ + } +}; + +struct Exchange{ + + // Exchange0 is valid for arbitrary SVE vector length + template + static inline void Exchange0(vec &out1, vec &out2, const vec &in1, const vec &in2){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svext(a1_v, a1_v, (uint64_t)W::c); + r1_v = svext(r1_v, a2_v, (uint64_t)W::c); + typename acle::vt r2_v = svext(a2_v, a2_v, (uint64_t)W::c); + r2_v = svext(a1_v, r2_v, (uint64_t)W::c); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + template + static inline void Exchange1(vec &out1, vec &out2, const vec &in1, const vec &in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into ldp + stp -> SFI + svbool_t pg1 = acle::pg1(); + const vec::uint> tbl_exch1a = acle::tbl_exch1a(); + const vec::uint> tbl_exch1b = acle::tbl_exch1b(); + const vec::uint> tbl_exch1c = acle::tbl_exch1c(); + + typename acle::svuint tbl_exch1a_v = svld1(pg1, tbl_exch1a.v); + typename acle::svuint tbl_exch1b_v = svld1(pg1, tbl_exch1b.v); + typename acle::svuint 
tbl_exch1c_v = svld1(pg1, tbl_exch1c.v); + + typename acle::vt in1_v = svld1(pg1, in1.v); + typename acle::vt in2_v = svld1(pg1, in2.v); + + typename acle::vt a1_v = svtbl(in1_v, tbl_exch1a_v); + typename acle::vt a2_v = svtbl(in2_v, tbl_exch1b_v); + typename acle::vt b1_v = svext(a2_v, a1_v, (uint64_t)(W::r / 2u)); + typename acle::vt b2_v = svext(a1_v, a2_v, (uint64_t)(W::r / 2u)); + typename acle::vt out1_v = svtbl(b1_v, tbl_exch1c_v); + typename acle::vt out2_v = svtbl(b2_v, tbl_exch1a_v); + + svst1(pg1, out1.v, out1_v); + svst1(pg1, out2.v, out2_v); + } + + template + static inline void Exchange2(vec &out1, vec &out2, const vec &in1, const vec &in2){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, (typename acle::pt*)in1.v); + typename acle::vt a2_v = svld1(pg1, (typename acle::pt*)in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, (typename acle::pt*)out1.v, r1_v); + svst1(pg1, (typename acle::pt*)out2.v, r2_v); + } + + static inline void Exchange3(vecf &out1, vecf &out2, const vecf &in1, const vecf &in2){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a1_v = svld1(pg1, in1.v); + typename acle::vt a2_v = svld1(pg1, in2.v); + typename acle::vt r1_v = svtrn1(a1_v, a2_v); + typename acle::vt r2_v = svtrn2(a1_v, a2_v); + svst1(pg1, out1.v, r1_v); + svst1(pg1, out2.v, r2_v); + } + + static inline void Exchange3(vecd &out1, vecd &out2, const vecd &in1, const vecd &in2){ + assert(0); + return; + } +}; + +struct Permute{ + + // Permute0 is valid for any SVE vector width + template + static inline vec Permute0(vec in) { + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(W::r / 2u)); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute1(vecd in) { + vecd out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute1(vecf in) { + vecf out; + const vec::uint> tbl_swap = acle::tbl1(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute2(vecd in) { + vecd out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute2(vecf in) { + vecf out; + const vec::uint> tbl_swap = acle::tbl2(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecf Permute3(vecf in) { + vecf out; + const vec::uint> tbl_swap = acle::tbl_swap(); + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::svuint tbl_swap_v = svld1(pg1, tbl_swap.v); + typename acle::vt r_v = svtbl(a_v, tbl_swap_v); + svst1(pg1, out.v, r_v); + + return out; + } + + static inline vecd Permute3(vecd in) { + return in; + } + +}; + 
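+// Scalar reference sketch (illustration only, guarded out of compilation and
+// not used by the kernels): Permute0 above rotates a register by half its
+// width via svext, and Exchange0 pairs the low and high halves of two inputs,
+// i.e. Al Ah , Bl Bh -> Al Bl , Ah Bh. N stands for the SVE lane count.
+#if 0
+template <typename T, int N>
+static void Permute0_ref(T (&out)[N], const T (&in)[N]) {
+  for (int i = 0; i < N; i++) out[i] = in[(i + N/2) % N];   // svext(a, a, N/2)
+}
+template <typename T, int N>
+static void Exchange0_ref(T (&out1)[N], T (&out2)[N],
+                          const T (&in1)[N], const T (&in2)[N]) {
+  for (int i = 0; i < N/2; i++) {
+    out1[i]       = in1[i];         // Al
+    out1[i + N/2] = in2[i];         // Bl
+    out2[i]       = in1[i + N/2];   // Ah
+    out2[i + N/2] = in2[i + N/2];   // Bh
+  }
+}
+#endif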
+struct Rotate{ + + template static inline vec tRotate(vec in){ + vec out; + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + typename acle::vt r_v = svext(a_v, a_v, (uint64_t)(n%W::r)); + svst1(pg1, out.v, r_v); + + return out; + } + + template + static inline vec rotate(vec in, int n){ + + switch(n){ + case 0: return tRotate<0, T>(in); break; + case 1: return tRotate<1, T>(in); break; + case 2: return tRotate<2, T>(in); break; + case 3: return tRotate<3, T>(in); break; + case 4: return tRotate<4, T>(in); break; + case 5: return tRotate<5, T>(in); break; + case 6: return tRotate<6, T>(in); break; + case 7: return tRotate<7, T>(in); break; + + case 8: return tRotate<8, T>(in); break; + case 9: return tRotate<9, T>(in); break; + case 10: return tRotate<10, T>(in); break; + case 11: return tRotate<11, T>(in); break; + case 12: return tRotate<12, T>(in); break; + case 13: return tRotate<13, T>(in); break; + case 14: return tRotate<14, T>(in); break; + case 15: return tRotate<15, T>(in); break; + default: assert(0); + } + } +}; + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } +}; + +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg_even, a_v); + float b = svred(pg_odd, a_v); + + return Grid::ComplexF(a, b); + +} + +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + float a = svred(pg1, a_v); + + return a; +} + +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + svbool_t pg1 = acle::pg1(); + svbool_t pg_even = acle::pg_even(); + svbool_t pg_odd = acle::pg_odd(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg_even, a_v); + double b = svred(pg_odd, a_v); + + return Grid::ComplexD(a, b); +} + +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + double a = svred(pg1, a_v); + + return a; +} + +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + svbool_t pg1 = acle::pg1(); + typename acle::vt a_v = svld1(pg1, in.v); + Integer a = svred(pg1, a_v); + + return a; +} + +#undef svred +#undef vec_imm + +NAMESPACE_END(Optimization) + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + +typedef Optimization::vech SIMD_Htype; // Reduced precision type +typedef Optimization::vecf SIMD_Ftype; // Single precision type +typedef Optimization::vecd SIMD_Dtype; // Double precision type +typedef Optimization::veci SIMD_Itype; // Integer type + +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = 
Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultAddComplex MultAddComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid) diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h new file mode 100644 index 00000000..6b450012 --- /dev/null +++ b/Grid/simd/Grid_a64fx-fixedsize.h @@ -0,0 +1,769 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid_a64fx-fixedsize.h + + Copyright (C) 2020 + + Author: Nils Meyer Regensburg University + + with support from Arm + Richard Sandiford + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +///////////////////////////////////////////////////// +// Using SVE ACLE with fixed-size data types +///////////////////////////////////////////////////// + + +// gcc 10 features +#if __ARM_FEATURE_SVE_BITS==512 +/* gcc 10.0.1 and gcc 10.1 bug using ACLE data types CAS-159553-Y1K4C6 + workaround: use gcc's internal data types, bugfix expected for gcc 10.2 +typedef svbool_t pred __attribute__((arm_sve_vector_bits(512))); +typedef svfloat16_t vech __attribute__((arm_sve_vector_bits(512))); +typedef svfloat32_t vecf __attribute__((arm_sve_vector_bits(512))); +typedef svfloat64_t vecd __attribute__((arm_sve_vector_bits(512))); +typedef svuint32_t veci __attribute__((arm_sve_vector_bits(512))); +typedef svuint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float +typedef svuint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double +*/ +typedef __SVBool_t pred __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat16_t vech __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat32_t vecf __attribute__((arm_sve_vector_bits(512))); +typedef __SVFloat64_t vecd __attribute__((arm_sve_vector_bits(512))); +typedef __SVUint32_t veci __attribute__((arm_sve_vector_bits(512))); +typedef __SVUint32_t lutf __attribute__((arm_sve_vector_bits(512))); // LUTs for float +typedef __SVUint64_t lutd __attribute__((arm_sve_vector_bits(512))); // LUTs for double +#else +#pragma error("Oops. 
Illegal SVE vector size!?") +#endif /* __ARM_FEATURE_SVE_BITS */ + +// low-level API +NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Optimization); + +// convenience union types for tables eliminating loads +union ulutf { + lutf v; + uint32_t s[16]; +}; +union ulutd { + lutd v; + uint64_t s[8]; +}; + +template +struct acle{}; + +template <> +struct acle{ + static inline lutd tbl_swap(){ + const ulutd t = { .s = {1, 0, 3, 2, 5, 4, 7, 6} }; + return t.v; + } + static inline lutd tbl0(){ + const ulutd t = { .s = {4, 5, 6, 7, 0, 1, 2, 3} }; + return t.v; + } + static inline lutd tbl1(){ + const ulutd t = { .s = {2, 3, 0, 1, 6, 7, 4, 5} }; + return t.v; + } + static inline lutd tbl_exch1a(){ // Exchange1 + const ulutd t = { .s = {0, 1, 4, 5, 2, 3, 6, 7} }; + return t.v; + } + static inline lutd tbl_exch1b(){ // Exchange1 + const ulutd t = { .s = {2, 3, 6, 7, 0, 1, 4, 5} }; + return t.v; + } + static inline lutd tbl_exch1c(){ // Exchange1 + const ulutd t = { .s = {4, 5, 0, 1, 6, 7, 2, 3} }; + return t.v; + } + static inline pred pg1(){return svptrue_b64();} + static inline pred pg_even(){return svzip1_b64(svptrue_b64(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b64(svpfalse_b(), svptrue_b64());} + static inline vecd zero(){return svdup_f64(0.);} +}; + +template <> +struct acle{ + // exchange neighboring elements + static inline lutf tbl_swap(){ + const ulutf t = { .s = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; + return t.v; + } + static inline lutf tbl0(){ + const ulutf t = { .s = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7} }; + return t.v; + } + static inline lutf tbl1(){ + const ulutf t = { .s = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11} }; + return t.v; + } + static inline lutf tbl2(){ + const ulutf t = { .s = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} }; + return t.v; + } + static inline lutf tbl_exch1a(){ // Exchange1 + const ulutf t = { .s = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 } }; + return t.v; + } + static inline lutf tbl_exch1b(){ // Exchange1 + const ulutf t = { .s = {4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11 } }; + return t.v; + } + static inline lutf tbl_exch1c(){ // Exchange1 + const ulutf t = { .s = {8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7} }; + return t.v; + } + static inline pred pg1(){return svptrue_b32();} + static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} + static inline vecf zero(){return svdup_f32(0.);} +}; + +template <> +struct acle{ + static inline pred pg1(){return svptrue_b16();} + static inline pred pg_even(){return svzip1_b16(svptrue_b16(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b16(svpfalse_b(), svptrue_b16());} + static inline vech zero(){return svdup_f16(0.);} +}; + +template <> +struct acle{ + //static inline svbool_t pg1(){return svptrue_b16();} + static inline pred pg1(){return svptrue_b32();} + static inline pred pg_even(){return svzip1_b32(svptrue_b32(), svpfalse_b());} + static inline pred pg_odd() {return svzip1_b32(svpfalse_b(), svptrue_b32());} +}; + +// --------------------------------------------------- + +struct Vsplat{ + // Complex float + inline vecf operator()(float a, float b){ + vecf a_v = svdup_f32(a); + vecf b_v = svdup_f32(b); + return svzip1(a_v, b_v); + } + // Real float + inline vecf operator()(float a){ + return svdup_f32(a); + } + // Complex double + inline vecd operator()(double a, double b){ + vecd a_v = 
svdup_f64(a); + vecd b_v = svdup_f64(b); + return svzip1(a_v, b_v); + } + // Real double + inline vecd operator()(double a){ + return svdup_f64(a); + } + // Integer + inline veci operator()(Integer a){ + return svdup_u32(a); + } +}; + +struct Vstore{ + // Real float + inline void operator()(vecf a, float *D){ + pred pg1 = acle::pg1(); + svst1(pg1, D, a); + } + // Real double + inline void operator()(vecd a, double *D){ + pred pg1 = acle::pg1(); + svst1(pg1, D, a); + } + // Real float + inline void operator()(veci a, Integer *D){ + pred pg1 = acle::pg1(); + svst1(pg1, D, a); + } +}; + +struct Vstream{ + // Real float + inline void operator()(float * a, vecf b){ + pred pg1 = acle::pg1(); + svstnt1(pg1, a, b); + //svst1(pg1, a, b); + } + // Real double + inline void operator()(double * a, vecd b){ + pred pg1 = acle::pg1(); + svstnt1(pg1, a, b); + //svst1(pg1, a, b); + } +}; + +struct Vset{ + // Complex float + inline vecf operator()(Grid::ComplexF *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, (float*)a); + } + // Complex double + inline vecd operator()(Grid::ComplexD *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, (double*)a); + } + // Real float + inline vecf operator()(float *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } + // Real double + inline vecd operator()(double *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } + // Integer + inline veci operator()(Integer *a){ + pred pg1 = acle::pg1(); + return svld1(pg1, a); + } +}; + +///////////////////////////////////////////////////// +// Arithmetic operations +///////////////////////////////////////////////////// + +struct Sum{ + // Complex/real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); + } + // Complex/real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); + } + // Integer + inline veci operator()(veci a, veci b){ + pred pg1 = acle::pg1(); + return svadd_x(pg1, a, b); + } +}; + +struct Sub{ + // Complex/real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); + } + // Complex/real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); + } + // Integer + inline veci operator()(veci a, veci b){ + pred pg1 = acle::pg1(); + return svsub_x(pg1, a, b); + } + +}; + +struct Mult{ + // Real float fma + inline vecf operator()(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + return svmad_x(pg1, b, c, a); + } + // Real double fma + inline vecd operator()(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + return svmad_x(pg1, b, c, a); + } + // Real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); + } + // Real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); + } + // Integer + inline veci operator()(veci a, veci b){ + pred pg1 = acle::pg1(); + return svmul_x(pg1, a, b); + } +}; + +struct MultRealPart{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + // using FCMLA + vecf z_v = acle::zero(); + return svcmla_x(pg1, z_v, a, b, 0); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + // using FCMLA + vecd z_v = acle::zero(); + return svcmla_x(pg1, z_v, a, b, 0); + } +}; + +struct MaddRealPart{ + // Complex float + inline vecf operator()(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + // using FCMLA + return svcmla_x(pg1, 
c, a, b, 0); + } + // Complex double + inline vecd operator()(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + // using FCMLA + return svcmla_x(pg1, c, a, b, 0); + } +}; + +struct MultComplex{ + // Complex a*b + // Complex float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + vecf z = acle::zero(); + // using FCMLA + vecf r_v = svcmla_x(pg1, z, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + vecd z = acle::zero(); + // using FCMLA + vecd r_v = svcmla_x(pg1, z, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } +}; + +struct MultAddComplex{ + // Complex a*b+c + // Complex float + inline vecf operator()(vecf a, vecf b, vecf c){ + pred pg1 = acle::pg1(); + // using FCMLA + vecf r_v = svcmla_x(pg1, c, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } + // Complex double + inline vecd operator()(vecd a, vecd b, vecd c){ + pred pg1 = acle::pg1(); + // using FCMLA + vecd r_v = svcmla_x(pg1, c, a, b, 0); + return svcmla_x(pg1, r_v, a, b, 90); + } +}; + +struct Div{ + // Real float + inline vecf operator()(vecf a, vecf b){ + pred pg1 = acle::pg1(); + return svdiv_x(pg1, a, b); + } + // Real double + inline vecd operator()(vecd a, vecd b){ + pred pg1 = acle::pg1(); + return svdiv_x(pg1, a, b); + } +}; + +struct Conj{ + // Complex float + inline vecf operator()(vecf a){ + pred pg_odd = acle::pg_odd(); + //return svneg_x(pg_odd, a); this is unsafe + return svneg_m(a, pg_odd, a); + } + // Complex double + inline vecd operator()(vecd a){ + pred pg_odd = acle::pg_odd(); + //return svneg_x(pg_odd, a); this is unsafe + return svneg_m(a, pg_odd, a); + } +}; + +struct TimesMinusI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + lutf tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_odd = acle::pg_odd(); + + vecf a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_odd, a_v); this is unsafe + return svneg_m(a_v, pg_odd, a_v); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + lutd tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_odd = acle::pg_odd(); + + vecd a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_odd, a_v); this is unsafe + return svneg_m(a_v, pg_odd, a_v); + } +}; + +struct TimesI{ + // Complex float + inline vecf operator()(vecf a, vecf b){ + lutf tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_even = acle::pg_even(); + + vecf a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_even, a_v); this is unsafe + return svneg_m(a_v, pg_even, a_v); + } + // Complex double + inline vecd operator()(vecd a, vecd b){ + lutd tbl_swap = acle::tbl_swap(); + pred pg1 = acle::pg1(); + pred pg_even = acle::pg_even(); + + vecd a_v = svtbl(a, tbl_swap); + //return svneg_x(pg_even, a_v); this is unsafe + return svneg_m(a_v, pg_even, a_v); + } +}; + +struct PrecisionChange { + static inline vech StoH (vecf sa, vecf sb) { + pred pg1s = acle::pg1(); + vech ha_v = svcvt_f16_x(pg1s, sa); + vech hb_v = svcvt_f16_x(pg1s, sb); + return svuzp1(ha_v, hb_v); + } + static inline void HtoS(vech h,vecf &sa,vecf &sb) { + pred pg1s = acle::pg1(); + vech ha_v = svzip1(h, h); + vech hb_v = svzip2(h, h); + sa = svcvt_f32_x(pg1s, ha_v); + sb = svcvt_f32_x(pg1s, hb_v); + } + static inline vecf DtoS (vecd a,vecd b) { + pred pg1d = acle::pg1(); + vecf sa_v = svcvt_f32_x(pg1d, a); + vecf sb_v = svcvt_f32_x(pg1d, b); + return svuzp1(sa_v, sb_v); + } + static inline void StoD (vecf s,vecd &a,vecd &b) { + pred pg1d = acle::pg1(); + vecf sa_v = 
svzip1(s, s); + vecf sb_v = svzip2(s, s); + a = svcvt_f64_x(pg1d, sa_v); + b = svcvt_f64_x(pg1d, sb_v); + } + static inline vech DtoH (vecd a,vecd b,vecd c,vecd d) { + pred pg1d = acle::pg1(); + pred pg1h = acle::pg1(); + vech ha_v = svcvt_f16_x(pg1d, a); + vech hb_v = svcvt_f16_x(pg1d, b); + vech hc_v = svcvt_f16_x(pg1d, c); + vech hd_v = svcvt_f16_x(pg1d, d); + vech hab_v = svuzp1(ha_v, hb_v); + vech hcd_v = svuzp1(hc_v, hd_v); + return svuzp1(hab_v, hcd_v); + +/* + vecf sa,sb; + sa = DtoS(a,b); + sb = DtoS(c,d); + return StoH(sa,sb); +*/ + } + static inline void HtoD(vech h,vecd &a,vecd &b,vecd &c,vecd &d) { + pred pg1h = acle::pg1(); + pred pg1d = acle::pg1(); + vech sa_v = svzip1(h, h); + vech sb_v = svzip2(h, h); + vech da_v = svzip1(sa_v, sa_v); + vech db_v = svzip2(sa_v, sa_v); + vech dc_v = svzip1(sb_v, sb_v); + vech dd_v = svzip2(sb_v, sb_v); + a = svcvt_f64_x(pg1d, da_v); + b = svcvt_f64_x(pg1d, db_v); + c = svcvt_f64_x(pg1d, dc_v); + d = svcvt_f64_x(pg1d, dd_v); + +/* + vecf sa,sb; + HtoS(h,sa,sb); + StoD(sa,a,b); + StoD(sb,c,d); +*/ + } +}; + +struct Exchange{ + // float + static inline void Exchange0(vecf &out1, vecf &out2, vecf in1, vecf in2){ + vecf r1_v = svext(in1, in1, (uint64_t)8u); + vecf r2_v = svext(in2, in2, (uint64_t)8u); + out1 = svext(r1_v, in2, (uint64_t)8u); + out2 = svext(in1, r2_v, (uint64_t)8u); + } + static inline void Exchange1(vecf &out1, vecf &out2, vecf in1, vecf in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI + lutf tbl_exch1a = acle::tbl_exch1a(); + lutf tbl_exch1b = acle::tbl_exch1b(); + lutf tbl_exch1c = acle::tbl_exch1c(); + + vecf a1_v = svtbl(in1, tbl_exch1a); + vecf a2_v = svtbl(in2, tbl_exch1b); + vecf b1_v = svext(a2_v, a1_v, (uint64_t)8u); + vecf b2_v = svext(a1_v, a2_v, (uint64_t)8u); + out1 = svtbl(b1_v, tbl_exch1c); + out2 = svtbl(b2_v, tbl_exch1a); + } + static inline void Exchange2(vecf &out1, vecf &out2, vecf in1, vecf in2){ + out1 = (vecf)svtrn1((vecd)in1, (vecd)in2); + out2 = (vecf)svtrn2((vecd)in1, (vecd)in2); + } + static inline void Exchange3(vecf &out1, vecf &out2, vecf in1, vecf in2){ + out1 = svtrn1(in1, in2); + out2 = svtrn2(in1, in2); + } + + // double + static inline void Exchange0(vecd &out1, vecd &out2, vecd in1, vecd in2){ + vecd r1_v = svext(in1, in1, (uint64_t)4u); + vecd r2_v = svext(in2, in2, (uint64_t)4u); + out1 = svext(r1_v, in2, (uint64_t)4u); + out2 = svext(in1, r2_v, (uint64_t)4u); + } + static inline void Exchange1(vecd &out1, vecd &out2, vecd in1, vecd in2){ + // this one is tricky; svtrn2q* from SVE2 fits best, but it is not available in SVE1 + // alternative: use 4-el structure; expect translation into 4x ldp + 4x stp -> SFI + lutd tbl_exch1a = acle::tbl_exch1a(); + lutd tbl_exch1b = acle::tbl_exch1b(); + lutd tbl_exch1c = acle::tbl_exch1c(); + + vecd a1_v = svtbl(in1, tbl_exch1a); + vecd a2_v = svtbl(in2, tbl_exch1b); + vecd b1_v = svext(a2_v, a1_v, (uint64_t)4u); + vecd b2_v = svext(a1_v, a2_v, (uint64_t)4u); + out1 = svtbl(b1_v, tbl_exch1c); + out2 = svtbl(b2_v, tbl_exch1a); + } + static inline void Exchange2(vecd &out1, vecd &out2, vecd in1, vecd in2){ + out1 = svtrn1(in1, in2); + out2 = svtrn2(in1, in2); + } + static inline void Exchange3(vecd &out1, vecd &out2, vecd in1, vecd in2){ + assert(0); + return; + } +}; + +#undef VECTOR_FOR + +struct Permute{ + // float + static inline vecf Permute0(vecf in) { + return svext(in, in, (uint64_t)8u); + } + static inline vecf 
Permute1(vecf in) { + lutf tbl_swap = acle::tbl1(); + return svtbl(in, tbl_swap); + } + static inline vecf Permute2(vecf in) { + lutf tbl_swap = acle::tbl2(); + return svtbl(in, tbl_swap); + } + static inline vecf Permute3(vecf in) { + lutf tbl_swap = acle::tbl_swap(); + return svtbl(in, tbl_swap); + } + + // double + static inline vecd Permute0(vecd in) { + return svext(in, in, (uint64_t)(8u / 2u)); + } + static inline vecd Permute1(vecd in) { + lutd tbl_swap = acle::tbl1(); + return svtbl(in, tbl_swap); + } + static inline vecd Permute2(vecd in) { + lutd tbl_swap = acle::tbl_swap(); + return svtbl(in, tbl_swap); + } + static inline vecd Permute3(vecd in) { + return in; + } +}; + +struct Rotate{ + + static inline vecf rotate(vecf in, int n){ + switch(n){ + case 0: return tRotate<0>(in); break; + case 1: return tRotate<1>(in); break; + case 2: return tRotate<2>(in); break; + case 3: return tRotate<3>(in); break; + case 4: return tRotate<4>(in); break; + case 5: return tRotate<5>(in); break; + case 6: return tRotate<6>(in); break; + case 7: return tRotate<7>(in); break; + + case 8: return tRotate<8>(in); break; + case 9: return tRotate<9>(in); break; + case 10: return tRotate<10>(in); break; + case 11: return tRotate<11>(in); break; + case 12: return tRotate<12>(in); break; + case 13: return tRotate<13>(in); break; + case 14: return tRotate<14>(in); break; + case 15: return tRotate<15>(in); break; + default: assert(0); + } + } + static inline vecd rotate(vecd in, int n){ + switch(n){ + case 0: return tRotate<0>(in); break; + case 1: return tRotate<1>(in); break; + case 2: return tRotate<2>(in); break; + case 3: return tRotate<3>(in); break; + case 4: return tRotate<4>(in); break; + case 5: return tRotate<5>(in); break; + case 6: return tRotate<6>(in); break; + case 7: return tRotate<7>(in); break; + default: assert(0); + } + } + + template static inline vecf tRotate(vecf in){ + return svext(in, in, (uint64_t)n); + } + template static inline vecd tRotate(vecd in){ + return svext(in, in, (uint64_t)n); + } +}; + +// tree-based reduction +#define svred(pg, v)\ +svaddv(pg, v); + +// left-to-right reduction +// #define svred(pg, v)\ +// svadda(pg, 0, v) + +template +struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + //exit(1); + return 0; + } +}; +//Complex float Reduce +template <> +inline Grid::ComplexF Reduce::operator()(vecf in){ + pred pg_even = acle::pg_even(); + pred pg_odd = acle::pg_odd(); + float a = svred(pg_even, in); + float b = svred(pg_odd, in); + return Grid::ComplexF(a, b); +} +//Real float Reduce +template <> +inline Grid::RealF Reduce::operator()(vecf in){ + pred pg1 = acle::pg1(); + return svred(pg1, in); +} +//Complex double Reduce +template <> +inline Grid::ComplexD Reduce::operator()(vecd in){ + pred pg_even = acle::pg_even(); + pred pg_odd = acle::pg_odd(); + double a = svred(pg_even, in); + double b = svred(pg_odd, in); + return Grid::ComplexD(a, b); +} +//Real double Reduce +template <> +inline Grid::RealD Reduce::operator()(vecd in){ + pred pg1 = acle::pg1(); + return svred(pg1, in); +} +//Integer Reduce +template <> +inline Integer Reduce::operator()(veci in){ + pred pg1 = acle::pg1(); + return svred(pg1, in); +} + +#undef svred + +NAMESPACE_END(Optimization); + +////////////////////////////////////////////////////////////////////////////////////// +// Here assign types + +typedef vech SIMD_Htype; // Reduced 
precision type +typedef vecf SIMD_Ftype; // Single precision type +typedef vecd SIMD_Dtype; // Double precision type +typedef veci SIMD_Itype; // Integer type + +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; + +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; + +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Div DivSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultAddComplex MultAddComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; +typedef Optimization::MaddRealPart MaddRealPartSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + +NAMESPACE_END(Grid); diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 4584fb36..b9c6a81b 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -32,7 +32,12 @@ Author: Peter Boyle */ //---------------------------------------------------------------------- +#ifdef GRID_CUDA #include +#endif +#ifdef GRID_HIP +#include +#endif namespace Grid { @@ -142,7 +147,7 @@ typedef GpuVector GpuVectorI; accelerator_inline float half2float(half h) { float f; -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT f = __half2float(h); #else //f = __half2float(h); @@ -156,7 +161,7 @@ accelerator_inline float half2float(half h) accelerator_inline half float2half(float f) { half h; -#ifdef __CUDA_ARCH__ +#ifdef GRID_SIMT h = __float2half(f); #else Grid_half hh = sfw_float_to_half(f); diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index e2b1fd07..c07077a3 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_vector_types.h @@ -73,7 +73,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; Grid_half o; - + o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; f.u ^= sign; @@ -93,7 +93,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { o.x = static_cast(f.u - denorm_magic.u); } else { unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - + // update exponent, rounding bias part 1 f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; // rounding bias part 2 @@ -101,7 +101,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { // take the bits! 
o.x = static_cast(f.u >> 13); } - } + } o.x |= static_cast(sign >> 16); return o; } @@ -110,9 +110,63 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GPU_VEC #include "Grid_gpu_vec.h" #endif +/* #ifdef GEN #include "Grid_generic.h" #endif +*/ + +#ifdef GEN + #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here + #include + #if defined(A64FX) // VLA + #pragma message("building A64FX / SVE ACLE VLA") + #if defined(ARMCLANGCOMPAT) + #pragma message("applying data types patch") + #endif + #include "Grid_a64fx-2.h" + #endif + #if defined(A64FXFIXEDSIZE) // fixed size data types + #pragma message("building for A64FX / SVE ACLE fixed size") + #include "Grid_a64fx-fixedsize.h" + #endif + #else + //#pragma message("building GEN") // generic + #include "Grid_generic.h" + #endif +#endif + +#ifdef A64FX + #include + #ifdef __ARM_FEATURE_SVE_BITS + //#pragma message("building A64FX SVE VLS") + #include "Grid_a64fx-fixedsize.h" + #else + #pragma message("building A64FX SVE VLA") + #if defined(ARMCLANGCOMPAT) + #pragma message("applying data types patch") + #endif + #include "Grid_a64fx-2.h" + #endif +#endif + +/* +#ifdef A64FXVLA +#pragma message("building A64FX VLA") +#if defined(ARMCLANGCOMPAT) + #pragma message("applying data types patch") +#endif +#include +#include "Grid_a64fx-2.h" +#endif + +#ifdef A64FXVLS +#pragma message("building A64FX VLS") +#include +#include "Grid_a64fx-fixedsize.h" +#endif +*/ + #ifdef SSE4 #include "Grid_sse4.h" #endif @@ -163,6 +217,12 @@ template struct is_complex : public std::false_type {}; template <> struct is_complex : public std::true_type {}; template <> struct is_complex : public std::true_type {}; +template struct is_ComplexD : public std::false_type {}; +template <> struct is_ComplexD : public std::true_type {}; + +template struct is_ComplexF : public std::false_type {}; +template <> struct is_ComplexF : public std::true_type {}; + template struct is_real : public std::false_type {}; template struct is_real::value, void>::type> : public std::true_type {}; @@ -170,7 +230,7 @@ template struct is_real struct is_integer : public std::false_type {}; template struct is_integer::value, void>::type> : public std::true_type {}; - + template using IfReal = Invoke::value, int> >; template using IfComplex = Invoke::value, int> >; template using IfInteger = Invoke::value, int> >; @@ -223,6 +283,69 @@ public: return sizeof(Vector_type) / sizeof(Scalar_type); } + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (Scalar_type*)this, svld1(svptrue_b8(), (Scalar_type*)&(rhs.v))); + return *this; + }; + + /* + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b8(), (int8_t*)this, svld1(svptrue_b8(), (int8_t*)&(rhs.v))); + return *this; + }; + */ + + // ComplexF + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; 
+ svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b32(), (float*)this, svld1(svptrue_b32(), (float*)&(rhs.v))); + return *this; + }; + + // ComplexD + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &&rhs) { + //v = rhs.v; + svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v))); + return *this; + }; + + template + accelerator_inline Grid_simd &operator=(const Grid_simd::value, S>::type, Vector_type> &rhs) { + //v = rhs.v; + svst1(svptrue_b64(), (double*)this, svld1(svptrue_b64(), (double*)&(rhs.v))); + return *this; + }; + + #else + accelerator_inline Grid_simd &operator=(const Grid_simd &&rhs) { v = rhs.v; return *this; @@ -232,10 +355,23 @@ public: return *this; }; // faster than not declaring it and leaving to the compiler + #endif accelerator Grid_simd() = default; - accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps - accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + + #ifdef ARMCLANGCOMPAT + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &rhs) { this->operator=(rhs); } + template + accelerator_inline Grid_simd(const Grid_simd::value, S>::type, Vector_type> &&rhs) { this->operator=(rhs); } + #else + accelerator_inline Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps + accelerator_inline Grid_simd(const Grid_simd &&rhs) : v(rhs.v){}; + #endif accelerator_inline Grid_simd(const Real a) { vsplat(*this, Scalar_type(a)); }; // Enable if complex type template accelerator_inline @@ -258,12 +394,21 @@ public: /////////////////////////////////////////////// // FIXME -- alias this to an accelerator_inline MAC struct. 
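+  // Note (illustrative, following the FCMLA-based MultAddComplex in the A64FX
+  // headers): on A64FX the mac() below is routed to fxmac(), defined further
+  // down, which fuses the complex y = a*x + y into two FCMLA accumulations
+  // rather than a separate multiply and add:
+  //   rot 0  : y.re += a.re*x.re ,  y.im += a.re*x.im
+  //   rot 90 : y.re -= a.im*x.im ,  y.im += a.im*x.re
+  // The real/integer overload of fxmac uses the three-operand (fused) MultSIMD
+  // form instead.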
+ + #if defined(A64FX) || defined(A64FXFIXEDSIZE) + friend accelerator_inline void mac(Grid_simd *__restrict__ y, + const Grid_simd *__restrict__ a, + const Grid_simd *__restrict__ x) { + *y = fxmac((*a), (*x), (*y)); + }; + #else friend accelerator_inline void mac(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ a, const Grid_simd *__restrict__ x) { *y = (*a) * (*x) + (*y); }; - + #endif + friend accelerator_inline void mult(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -412,7 +557,7 @@ public: Grid_simd ret; Grid_simd::conv_t conv; Grid_simd::scalar_type s; - + conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { s = conv.s[i]; @@ -441,7 +586,7 @@ public: return ret; } /////////////////////// - // Exchange + // Exchange // Al Ah , Bl Bh -> Al Bl Ah,Bh /////////////////////// friend accelerator_inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) @@ -452,20 +597,20 @@ public: Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } else if(n==1) { Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); - } else if(n==0) { + } else if(n==0) { Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } } - friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); } - friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ + friend accelerator_inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){ Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); } //////////////////////////////////////////////////////////////////// @@ -490,7 +635,7 @@ public: int dist = perm & 0xF; y = rotate(b, dist); return; - } + } else if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); @@ -564,29 +709,29 @@ accelerator_inline Grid_simd rotate(Grid_simd b, int nrot) { ret.v = Optimization::Rotate::rotate(b.v, 2 * nrot); return ret; } -template =0> +template =0> accelerator_inline void rotate( Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,nrot); } -template =0> +template =0> accelerator_inline void rotate(Grid_simd &ret,Grid_simd b,int nrot) { nrot = nrot % Grid_simd::Nsimd(); ret.v = Optimization::Rotate::rotate(b.v,2*nrot); } -template +template accelerator_inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); -} -template =0> +} +template =0> accelerator_inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; ret.v = unary(real(typepun[lane]), VsplatSIMD()); -} +} @@ -741,6 +886,27 @@ accelerator_inline Grid_simd operator*(Grid_simd a, Grid_simd return ret; }; +// 
---------------- A64FX MAC ------------------- +// Distinguish between complex types and others +#if defined(A64FX) || defined(A64FXFIXEDSIZE) +template = 0> +accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MultAddComplexSIMD()); + return ret; +}; + +// Real/Integer types +template = 0> +accelerator_inline Grid_simd fxmac(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MultSIMD()); + return ret; +}; +#endif +// ---------------------------------------------- + + // Distinguish between complex types and others template = 0> accelerator_inline Grid_simd operator/(Grid_simd a, Grid_simd b) { @@ -877,7 +1043,7 @@ accelerator_inline typename toComplexMapper::Complexified toComplex(const conv.v = in.v; for (int i = 0; i < Rsimd::Nsimd(); i += 2) { - assert(conv.s[i + 1] == conv.s[i]); + assert(conv.s[i + 1] == conv.s[i]); // trap any cases where real was not duplicated // indicating the SIMD grids of real and imag assignment did not correctly // match @@ -919,6 +1085,14 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec) for(int m=0;m*2 #endif @@ -65,7 +65,7 @@ typedef RealD Real; typedef RealF Real; #endif -#ifdef GRID_NVCC +#if defined(GRID_CUDA) || defined(GRID_HIP) typedef thrust::complex ComplexF; typedef thrust::complex ComplexD; typedef thrust::complex Complex; diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py new file mode 100755 index 00000000..f00a5019 --- /dev/null +++ b/Grid/simd/gridverter.py @@ -0,0 +1,2377 @@ +#!/usr/bin/python3 + +import re +import argparse +import sys + +# Grid for A64FX +# +# * should align std::vector to (multiples of) cache block size = 256 bytes + +# place benchmark runtime in cycles here ! 
+measured_cycles = 690 #1500 #775 #1500 + + +# command line parser +parser = argparse.ArgumentParser(description="Dslash generator.") +parser.add_argument("--single", action="store_true", default="False") +parser.add_argument("--double", action="store_true", default="True") +parser.add_argument("--debug", action="store_true", default="False") +parser.add_argument("--gridbench", action="store_true", default="False") +args = parser.parse_args() + +print(args) + +ASM_LOAD_CHIMU = True # load chimu +ASM_LOAD_GAUGE = True # load gauge +ASM_LOAD_TABLE = True # load table +ASM_STORE = True # store result + +# Disable all loads and stores in asm for benchmarking purposes +#DISABLE_ASM_LOAD_STORE = True +DISABLE_ASM_LOAD_STORE = False + +if DISABLE_ASM_LOAD_STORE: + ASM_LOAD_CHIMU = True # load chimu + ASM_LOAD_GAUGE = True # load gauge + ASM_LOAD_TABLE = True # load table + ASM_STORE = False # store result + +# Alternative implementation using PROJ specific loads works, +# but be careful with predication + +ALTERNATIVE_LOADS = False +#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True + +# Alternative register mapping, +# must use with my_wilson4.h and my_wilson4pf.h + +ALTERNATIVE_REGISTER_MAPPING = False +#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING + +if ALTERNATIVE_REGISTER_MAPPING == True: + ALTERNATIVE_LOADS = False + +# use movprfx +MOVPRFX = False +MOVPRFX = not MOVPRFX + + +PREFETCH = False +PREFETCH = not PREFETCH # True + +PRECISION = 'double' # DP by default +PRECSUFFIX = 'A64FXd' +if args.single == True: + PRECISION = 'single' + PRECSUFFIX = 'A64FXf' + +_DEBUG = False #True # insert debugging output +if args.debug == True: + _DEBUG = True + +GRIDBENCH = False +if args.gridbench == True: + GRIDBENCH = True + +print("PRECISION = ", PRECISION) +print("DEBUG = ", _DEBUG) +print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS) +print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING) +print("MOVPRFX = ", MOVPRFX) +print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE) +print("GRIDBENCH = ", GRIDBENCH) + +print("") + +#sys.exit(0) + + +#_DEBUG = True # insert debugging output + +FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing +STORE_BASE_PTR_COLOR_OFFSET = 2 + +# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!! +# table address: x30 +# data address: x29 +# store address: x28 +# debug address: r8 + +# Max performance of complex FMA using FCMLA instruction +# is 25% peak. +# +# Issue latency of FCMLA is 2 cycles. +# Need 2 FCMLA instructions for complex FMA. +# Complete complex FMA takes 4 cycles. +# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles. +# A64FX FMA throughput is 4 * 8 * 2 * 2 = 132 Flops DP in 4 cycles. +# -> 25% peak FMA +# +# In: 3x 512 bits = 192 bytes +# Out: 1x 512 bits = 64 bytes +# Tot: 4x 512 bits = 256 bytes +# +# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) + +OPT = """ +* interleave prefetching and compute in MULT_2SPIN +* could test storing U's in MULT_2SPIN to L1d for cache line update +* structure reordering: MAYBEPERM after MULT_2SPIN ? 
+""" + +filename = 'XXX' +LEGAL = """/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: {} + + Copyright (C) 2020 + +Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +""" + +class Register: + + def __init__(self, variable, asmreg='X', predication=False): + global d + x = 'Y' + if predication == False: + x = asmreg # + d['asmsuffix'] + else: + x = asmreg + self.asmreg = x + self.asmregwithsuffix = asmreg + d['asmsuffix'] + self.asmregbyte = asmreg + '.b' + self.name = variable + self.asmname = variable + self.asmnamebyte = variable + '.b' + self.predication = predication + + d['registers'] += 1 + + def define(self, statement): + global d + d['C'] += F'#define {self.name} {statement}' + #d['A'] += F'#define {self.name} {statement}' + + def declare(self, predication=False): + global d + + if self.predication == False: + d['C'] += F' Simd {self.name}; \\\n' + + predtype = 'svfloat64_t' + if PRECISION == 'single': + predtype = 'svfloat32_t' + + d['I'] += F' {predtype} {self.name}; \\\n' + else: + d['I'] += F' svbool_t {self.name}; \\\n' + #d['A'] += F'#define {self.name} {self.asmreg} \n' + + def loadpredication(self, target='A'): + global d + if (target == 'A'): + d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' + d['asmclobber'].append(F'"{self.asmreg}"') + + def loadtable(self, t): + global d + d['load'] += d['factor'] + gpr = d['asmtableptr'] + + cast = 'uint64_t' + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + # asm_opcode = 'ld1w' + # cast = 'uint32_t' + asm_opcode = 'ldr' + if PRECISION == 'single': + asm_opcode = 'ldr' + cast = 'uint32_t' + + d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' + + # using immediate index break-out works + if asm_opcode == 'ldr': + # ldr version + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + else: + # ld1 version + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + + d['asminput'].append(F'[tableptr] "r" (&lut[0])') + d['asminput'].append(F'[index] "i" ({t})') + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + + def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): + global d + d['load'] += d['factor'] + indices = re.findall(r'\d+', address) + index = (int(indices[0]) - offset) * colors + int(indices[1]) + + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + #asm_opcode = 'ld1w' + # cast = 'float32_t' + + asm_opcode = 'ldr' + if PRECISION == 'single': + asm_opcode = 'ldr' + cast = 
'float32_t' + + gpr = d['asmfetchbaseptr'] + intrinfetchbase = d['intrinfetchbase'] + if (target in ['ALL', 'C']): + d['C'] += F' {self.name} = {address}; \\\n' + if (target in ['ALL', 'I']): +# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' + d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' + if (target in ['ALL', 'A']): + if asm_opcode == 'ldr': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + + def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): + global d + d['store'] += d['factor'] + indices = re.findall(r'\d+', address) + index = (int(indices[0]) - offset) * colors + int(indices[1]) + + #asm_opcode = 'stnt1d' + #if PRECISION == 'single': + # asm_opcode = 'stnt1w' + # cast = 'float32_t' + asm_opcode = 'str' + if PRECISION == 'single': + asm_opcode = 'str' + cast = 'float32_t' + + intrinstorebase = d['intrinstorebase'] + + d['C'] += F' {address} = {self.name}; \\\n' + #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + if asm_opcode == 'str': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + + def movestr(self, str): + global d + #d['move'] += d['factor'] + d['I'] += F' {self.name} = {str}; \\\n' + + def move(self, op1): + global d + d['move'] += d['factor'] + d['C'] += F' {self.name} = {op1.name}; \\\n' + d['I'] += F' {self.name} = {op1.name}; \\\n' + d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + + # a = a + b , a = b + c + def add(self, op1, op2=None): + global d + d['add'] += d['factor'] + if op2 is None: + d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' + d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' + d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' + d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' + d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' + + # a = a -b , a = b - c + def sub(self, op1, op2=None): + global d + d['sub'] += d['factor'] + if op2 is None: + d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' + d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' + d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' + d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' + d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' + + # a = a * b , a = b * c + def mul(self, op1, op2): + global d + d['mul'] += 2 * d['factor'] + d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = __svzero({self.name}); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + 
d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mul0(self, op1, op2, op3=None, constructive=False): + global d + d['mul'] += d['factor'] + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + + def mul1(self, op1, op2): + global d + d['mul'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mac(self, op1, op2): + global d + d['mac'] += 2 * d['factor'] + d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def mac0(self, op1, op2): + global d + d['mac'] += d['factor'] + d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' + + def mac1(self, op1, op2): + global d + d['mac'] += d['factor'] + d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def zero(self, zeroreg=False): + d['zero'] += d['factor'] + d['C'] += F' {self.name} = 0; \\\n' + #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang + + if PRECISION == 'double': + d['I'] += F' {self.name} = svdup_f64(0.); \\\n' + else: + d['I'] += F' {self.name} = svdup_f32(0.); \\\n' + + if zeroreg == True: + d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + else: + #using mov z, zero0 issue 1c, FLA, latency 6c + #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' + + #using mov z, 0 issue 1c, FLA, latency 6c + d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' + + #using xor z, z, z issue 0.5c, FL*, latency 4c + #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' + + #using and z, z, 
zero0 issue 0.5c, FL*, latency 4c + #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' + + #using sub z, z, z issue 0.5c, FL*, latency 9c + #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' + + # without table + def timesI(self, op1, tempreg=None, tablereg=None): + global d + d['timesI'] += d['factor'] + d['C'] += F' {self.name} = timesI({op1.name}); \\\n' + # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this + #table.load('table2', target='I', cast='uint64_t') + #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' + #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' + # timesI using trn tested, works but tbl should be faster + d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' + d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' + d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' + + def addTimesI(self, op1, op2=None, constructive=False): + global d + d['addTimesI'] += d['factor'] + + if op2 is None: + d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + else: + if op2 is None: + d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' + + def subTimesI(self, op1, op2=None, constructive=False): + global d + d['subTimesI'] += d['factor'] + + # no movprfx intrinsics support + if constructive == True: + d['movprfx'] += d['factor'] + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' + d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' + else: + if op2 is None: + d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' + else: + d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' + d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 
270); \\\n' + d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' + + # timesMinusI is not used, def is probably wrong !!!! OPTIMIZATION with table + def timesMinusI(self, op1): + global d + d['timesMinusI'] += d['factor'] + d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' + d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' + d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' + + def permute(self, dir, tablereg=None): + global d + d['permutes'] += d['factor'] + + d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' + + d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' + d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' + + # if dir == 0: + # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' + # # this might not work, see intrinsics assembly + # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' + # # use registers directly + # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' + # + # elif dir in [1, 2]: + # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' + # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' + + def debug(self): + global d + typecast = d['cfloat'] + gpr = d['asmdebugptr'] + vregs = d['asmclobberlist'] + if (d['debug'] == True): + d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' + + d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' + d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' + #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' + + d['A'] += F'asm ( \\\n' + d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier + d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' + d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier + d['A'] += F' : "=m" (debugreg.v) \\\n' + d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' + d['A'] += F' : "p5", "cc", "memory" \\\n' + d['A'] += F'); \\\n' + d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' + # this form of addressing is not valid! 
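# Illustration (not part of the generated kernels): why the trn2 / neg / trn1
# sequence used by timesI() above multiplies by i.  With complex numbers stored
# as interleaved [re, im] lanes, trn2 duplicates the imaginary lanes, neg flips
# their sign, and trn1 places them back in front of the original real lanes.
# The helpers below are hypothetical pure-Python stand-ins for the SVE
# intrinsics, assuming the interleaved lane layout used throughout this generator.

def sve_trn1(a, b):
    # even result lanes from a's even lanes, odd result lanes from b's even lanes
    return [a[i] if i % 2 == 0 else b[i - 1] for i in range(len(a))]

def sve_trn2(a, b):
    # even result lanes from a's odd lanes, odd result lanes from b's odd lanes
    return [a[i + 1] if i % 2 == 0 else b[i] for i in range(len(a))]

def emulate_times_i(lanes):
    tmp = sve_trn2(lanes, lanes)      # [im0, im0, im1, im1, ...]
    tmp = [-x for x in tmp]           # negate
    return sve_trn1(tmp, lanes)       # [-im0, re0, -im1, re1, ...]

# (1+2i, 3+4i) -> (-2+1i, -4+3i), i.e. multiplication by i
assert emulate_times_i([1.0, 2.0, 3.0, 4.0]) == [-2.0, 1.0, -4.0, 3.0]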
+ #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' +# end Register + +def define(s, target='ALL'): + x = F'#define {s} \n' + global d + if (target in ['ALL', 'C']): + d['C'] += x + if (target in ['ALL', 'I']): + d['I'] += x + if (target in ['ALL', 'A']): + d['A'] += x + +def definemultiline(s): + x = F'#define {s} \\\n' + global d + d['C'] += x + d['I'] += x + d['A'] += x + +def write(s, target='ALL'): + x = F'{s}\n' + global d + if (target in ['ALL', 'C']): + d['C'] += x + if (target in ['ALL', 'I']): + d['I'] += x + if (target in ['ALL', 'A']): + d['A'] += x + +def curlyopen(): + write(F'{{ \\') + +def curlyclose(): + write(F'}}') + +def newline(target='ALL'): + global d + + if target == 'A': + if d['A'][-2:] == '\\\n': + d['A'] = d['A'][:-2] + '\n\n' + else: + if d['C'][-2:] == '\\\n': + d['C'] = d['C'][:-2] + '\n\n' + if d['I'][-2:] == '\\\n': + d['I'] = d['I'][:-2] + '\n\n' + if d['A'][-2:] == '\\\n': + d['A'] = d['A'][:-2] + '\n\n' + +# load the base pointer for fetches +def fetch_base_ptr(address, target='A'): + global d + #d['load'] += d['factor'] + + # DEBUG + #colors=3 + #indices = re.findall(r'\d+', address) + #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) + #print(F'{address} (base)') + + vregs = d['asmclobberlist'] + if target == 'A': + d['asminput'].append(F'[fetchptr] "r" ({address})') + d['asmclobber'].extend(vregs) + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + if target == 'I': + #print("intrinfetchbase = ", address) + d['intrinfetchbase'] = address + +# load the base pointer for stores +def store_base_ptr(address, target='A'): + global d + #d['load'] += d['factor'] + gpr = d['asmstorebaseptr'] + vregs = d['asmclobberlist'] + if target == 'A': + d['asminput'].append(F'[storeptr] "r" ({address})') + d['asmclobber'].extend(vregs) + d['asmclobber'].append(F'"memory"') + d['asmclobber'].append(F'"cc"') + if target == 'I': + d['intrinstorebase'] = address + +def prefetch_L1(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PLDL1STRM" # weak + #policy = "PLDL1KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + +def prefetch_L2(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PLDL2STRM" # weak + #policy = "PLDL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + #d['A'] += + +def prefetch_L2_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL2STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + +def prefetch_L1_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL1STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + + +def asmopen(): + #write('asm volatile ( \\', target='A') + write('asm 
( \\', target='A') + + # DEBUG + #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier + #write('asm volatile ( \\', target='A') + +def asmclose(): + global d + + #print(d['asminput']) + + asmin = d['asminput'] + asmin_s = '' + if len(asmin) > 0: + asmin = list(dict.fromkeys(asmin)) # remove duplicates + #print(asmin) + for el in asmin: + asmin_s += el + ',' + asmin_s = asmin_s[:-1] + #print("-> ", asmin_s) + + d['asminput'] = [] + + asmout = d['asmoutput'] + asmout_s = '' + if len(asmout) > 0: + asmout = list(dict.fromkeys(asmout)) # remove duplicates + for el in asmout: + asmout_s += el + ',' + asmout_s = asmout_s[:-1] + + d['asmoutput'] = [] + + # DEBUG put all regs into clobber by default + d['asmclobber'].extend(d['asmclobberlist']) + + asmclobber = d['asmclobber'] + asmclobber_s = '' + #print(asmclobber) + if len(asmclobber) > 0: + asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates + for el in asmclobber: + asmclobber_s += el + ',' + asmclobber_s = asmclobber_s[:-1] + + d['asmclobber'] = [] + + # DEBUG + #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier + + + write(F' : {asmout_s} \\', target='A') + write(F' : {asmin_s} \\', target='A') + write(F' : {asmclobber_s} \\', target='A') + write('); \\', target='A') + +# -------------------------------------------------------------------------------- + +# string of vector registers to be used in clobber list +#clobberlist = ['"p0"'] +clobberlist = ['"p5"'] +clobberlist.append('"cc"') +for i in range(0, 32): + clobberlist.append(F'"z{i}"') + +d = { +'debug': _DEBUG, +'C': '', +'I': '', +'A': '', +'asmsuffix': '.d', # double precision by default +'cfloat': 'float64_t', +'registers': 0, +'load': 0, +'store': 0, +'move': 0, +'movprfx': 0, +'zero': 0, +'add': 0, +'sub': 0, +'mul': 0, +'mac': 0, +'permutes': 0, +'neg': 0, +'addTimesI': 0, +'subTimesI': 0, +'timesI': 0, +'timesMinusI': 0, +'flops': 0, +'factor': 1, # multiplicity +'asmtableptr': 'x30', +'asmfetchbaseptr': 'x29', +'asmstorebaseptr': 'x28', +'asmdebugptr': 'r12', +'asminput': [], +'asmoutput': [], +'asmclobber': [], +'asmclobberlist': clobberlist, +'intrinfetchbase': '', +'intrinstorebase': '', +'cycles_LOAD_CHIMU': 0, +'cycles_PROJ': 0, +'cycles_PERM': 0, +'cycles_MULT_2SPIN': 0, +'cycles_RECON': 0, +'cycles_RESULT': 0, +'cycles_ZERO_PSI': 0, +'cycles_PREFETCH_L1': 0, +'cycles_PREFETCH_L2': 0 +} + +if PRECISION == 'single': + d['asmsuffix'] = '.s' + d['cfloat'] = 'float32_t' + +# -------------------------------------------------------------------------------- +# Grid +# -------------------------------------------------------------------------------- + +# Variables / Registers +result_00 = Register('result_00', asmreg='z0') +result_01 = Register('result_01', asmreg='z1') +result_02 = Register('result_02', asmreg='z2') +result_10 = Register('result_10', asmreg='z3') +result_11 = Register('result_11', asmreg='z4') +result_12 = Register('result_12', asmreg='z5') +result_20 = Register('result_20', asmreg='z6') +result_21 = Register('result_21', asmreg='z7') +result_22 = Register('result_22', asmreg='z8') +result_30 = Register('result_30', asmreg='z9') +result_31 = Register('result_31', asmreg='z10') +result_32 = Register('result_32', asmreg='z11') # 12 Regs +Chi_00 = Register('Chi_00', asmreg='z12') +Chi_01 = Register('Chi_01', asmreg='z13') +Chi_02 = Register('Chi_02', asmreg='z14') +Chi_10 = Register('Chi_10', asmreg='z15') +Chi_11 = Register('Chi_11', asmreg='z16') +Chi_12 = 
Register('Chi_12', asmreg='z17') # 6 +UChi_00 = Register('UChi_00', asmreg='z18') +UChi_01 = Register('UChi_01', asmreg='z19') +UChi_02 = Register('UChi_02', asmreg='z20') +UChi_10 = Register('UChi_10', asmreg='z21') +UChi_11 = Register('UChi_11', asmreg='z22') +UChi_12 = Register('UChi_12', asmreg='z23') # 6 +U_00 = Register('U_00', asmreg='z24') +U_10 = Register('U_10', asmreg='z25') +U_20 = Register('U_20', asmreg='z26') +U_01 = Register('U_01', asmreg='z27') +U_11 = Register('U_11', asmreg='z28') +U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers + +table0 = Register('table0', asmreg='z30') +zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers +# can't overload temp1 / table due to type mismatch using intrinsics :( +# typecasting SVE intrinsics variables is not allowed + +pg1 = Register('pg1', predication=True, asmreg='p5') +#pg2 = Register('pg2', predication=True, asmreg='p1') + +# Overloaded with Chi_* and UChi_* +Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) +Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) +Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) +Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) +Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) +Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) +if ALTERNATIVE_REGISTER_MAPPING == False: + Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) + Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) + Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) + Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) + Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) + Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers +else: # wilson4.h + Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) + Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) + Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) + Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) + Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) + Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) + +# debugging output +def debugall(msg=None, group='ALL'): + global d + if (d['debug'] == False): + return + write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') + if (group in ['ALL', 'result']): + result_00.debug() + result_01.debug() + result_02.debug() + result_10.debug() + result_11.debug() + result_12.debug() + result_20.debug() + result_21.debug() + result_22.debug() + result_30.debug() + result_31.debug() + result_32.debug() + if (group in ['ALL', 'Chi']): + Chi_00.debug() + Chi_01.debug() + Chi_02.debug() + Chi_10.debug() + Chi_11.debug() + Chi_12.debug() + if (group in ['ALL', 'UChi']): + UChi_00.debug() + UChi_01.debug() + UChi_02.debug() + UChi_10.debug() + UChi_11.debug() + UChi_12.debug() + if (group in ['ALL', 'U']): + U_00.debug() + U_10.debug() + U_20.debug() + U_01.debug() + U_11.debug() + U_21.debug() + if (group in ['ALL', 'Chimu']): + Chimu_00.debug() + Chimu_01.debug() + Chimu_02.debug() + Chimu_10.debug() + Chimu_11.debug() + Chimu_12.debug() + Chimu_20.debug() + Chimu_21.debug() + Chimu_22.debug() + Chimu_30.debug() + Chimu_31.debug() + Chimu_32.debug() + +# -------------------------------------------------------------------------------- +# Output +# -------------------------------------------------------------------------------- + +if ALTERNATIVE_LOADS == True: + define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') + define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') + define(F'LOAD_CHIMU(x)') +else: + #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) 
LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') + define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') + +if PREFETCH: + define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') + define(F'PF_GAUGE(A)') + define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH1_CHIMU(A)') + define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH_CHIMU(A)') +else: + define(F'PREFETCH_CHIMU_L1(A)') + define(F'PREFETCH_GAUGE_L1(A)') + define(F'PREFETCH_CHIMU_L2(A)') + define(F'PREFETCH_GAUGE_L2(A)') + define(F'PF_GAUGE(A)') + define(F'PREFETCH1_CHIMU(A)') + define(F'PREFETCH_CHIMU(A)') + define(F'PREFETCH_RESULT_L2_STORE(A)') + +# standard defines +define(F'LOCK_GAUGE(A)') +define(F'UNLOCK_GAUGE(A)') +define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') +define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') +define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') +define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') +# don't need zero psi, everything is done in recons +#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') +define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') +# loads projections +define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') +define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') +define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') +define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') +define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') +define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') +define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') +define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') +# recons +define(F'XP_RECON XP_RECON_{PRECSUFFIX}') +define(F'XM_RECON XM_RECON_{PRECSUFFIX}') +define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') +define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') +define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') +define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') +define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') +define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') +define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') +define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') +# new permutes +define(F'PERMUTE_DIR0 0') +define(F'PERMUTE_DIR1 1') +define(F'PERMUTE_DIR2 2') +define(F'PERMUTE_DIR3 3') +define(F'PERMUTE PERMUTE_{PRECSUFFIX};') +# load table +#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') +if PRECISION == 'double': + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') + define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') +else: + define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') + define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') + + + +write('// DECLARATIONS') +definemultiline(F'DECLARATIONS_{PRECSUFFIX}') +# debugging register +if d['debug'] == True: + write(' Simd debugreg; \\') +# perm tables +if PRECISION == 'double': + write(' const uint64_t lut[4][8] = { \\') + write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = 
swap register halves + write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves + write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im + write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity +else: + write(' const uint32_t lut[4][16] = { \\') + write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves + write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves + write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves + write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im + +#newline(target='A') +result_00.declare() +result_01.declare() +result_02.declare() +result_10.declare() +result_11.declare() +result_12.declare() +result_20.declare() +result_21.declare() +result_22.declare() +result_30.declare() +result_31.declare() +result_32.declare() # 12 +Chi_00.declare() +Chi_01.declare() +Chi_02.declare() +Chi_10.declare() +Chi_11.declare() +Chi_12.declare() # 6 +UChi_00.declare() +UChi_01.declare() +UChi_02.declare() +UChi_10.declare() +UChi_11.declare() +UChi_12.declare() # 6 +U_00.declare() +U_10.declare() +U_20.declare() +U_01.declare() +U_11.declare() +U_21.declare() # 6 -> 30 regs + +# all predications true +pg1.declare() +if PRECISION == 'double': + pg1.movestr('svptrue_b64()') +else: + pg1.movestr('svptrue_b32()') + +# tables +if PRECISION == 'double': + write(' svuint64_t table0; \\', target='I') # -> 31 regs +else: + write(' svuint32_t table0; \\', target='I') # -> 31 regs + +zero0.declare() + +# zero register +asmopen() +zero0.zero(zeroreg=True) +asmclose() +newline() + +define('Chimu_00 Chi_00', target='I') +define('Chimu_01 Chi_01', target='I') +define('Chimu_02 Chi_02', target='I') +define('Chimu_10 Chi_10', target='I') +define('Chimu_11 Chi_11', target='I') +define('Chimu_12 Chi_12', target='I') +if ALTERNATIVE_REGISTER_MAPPING == False: + define('Chimu_20 UChi_00', target='I') + define('Chimu_21 UChi_01', target='I') + define('Chimu_22 UChi_02', target='I') + define('Chimu_30 UChi_10', target='I') + define('Chimu_31 UChi_11', target='I') + define('Chimu_32 UChi_12', target='I') +else: # wilson4.h + define('Chimu_20 U_00', target='I') + define('Chimu_21 U_10', target='I') + define('Chimu_22 U_20', target='I') + define('Chimu_30 U_01', target='I') + define('Chimu_31 U_11', target='I') + define('Chimu_32 U_21', target='I') +newline() + + +d['cycles_RESULT'] += 12 +write('// RESULT') +definemultiline(F'RESULT_{PRECSUFFIX}(base)') +if ASM_STORE: + curlyopen() + #write(' SiteSpinor & ref(out[ss]); \\') + asmopen() + #pg1.loadpredication() + #store_base_ptr("&ref[0][0]") + #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") + store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + result_00.store("ref[0][0]") + result_01.store("ref[0][1]") + result_02.store("ref[0][2]") + result_10.store("ref[1][0]") + result_11.store("ref[1][1]") + result_12.store("ref[1][2]") + result_20.store("ref[2][0]") + result_21.store("ref[2][1]") + result_22.store("ref[2][2]") + result_30.store("ref[3][0]") + result_31.store("ref[3][1]") + result_32.store("ref[3][2]") + asmclose() + debugall('RESULT', group='result') + curlyclose() +newline() + +# prefetch spinors from memory into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_CHIMU_L2 (prefetch to L2)') 
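# Illustration (not part of the generated kernels): the PERMUTE macro is a plain
# lane shuffle.  svtbl gathers lanes by the indices held in the table register,
# so the lut rows declared above encode the permutation directions directly.
# The sketch below assumes a 512-bit vector, i.e. 8 double-precision lanes, and
# mirrors the double-precision rows.

lut_double = [
    [4, 5, 6, 7, 0, 1, 2, 3],   # 0 = swap register halves
    [2, 3, 0, 1, 6, 7, 4, 5],   # 1 = swap halves of halves
    [1, 0, 3, 2, 5, 4, 7, 6],   # 2 = swap re/im within each complex pair
]

def sve_tbl(vec, table):
    return [vec[i] for i in table]

lanes = list(range(8))                                   # stand-in for one z register
assert sve_tbl(lanes, lut_double[0]) == [4, 5, 6, 7, 0, 1, 2, 3]
assert sve_tbl(sve_tbl(lanes, lut_double[2]), lut_double[2]) == lanes   # involution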
+definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"base", target='A') +prefetch_L2(F"base", 0) +prefetch_L2(F"base", 1) +prefetch_L2(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch spinors from memory into L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_CHIMU_L1 (prefetch to L1)') +definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +#pg1.loadpredication() +fetch_base_ptr(F"base", target='A') +prefetch_L1(F"base", 0) +prefetch_L1(F"base", 1) +prefetch_L1(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch gauge from memory into L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_GAUGE_L2 (prefetch to L2)') +definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') +curlyopen() +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') +else: + write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"baseU", target='A') +prefetch_L2(F"baseU", -1) +prefetch_L2(F"baseU", 0) +prefetch_L2(F"baseU", 1) +prefetch_L2(F"baseU", 2) +prefetch_L2(F"baseU", 3) +prefetch_L2(F"baseU", 4) +prefetch_L2(F"baseU", 5) +prefetch_L2(F"baseU", 6) +prefetch_L2(F"baseU", 7) +#prefetch_L2(F"baseU", 8) +asmclose() +curlyclose() +newline() + +# prefetch gauge from memory into L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_GAUGE_L1 (prefetch to L1)') +definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') +curlyopen() +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") +fetch_base_ptr(F"baseU", target='A') +prefetch_L1(F"baseU", 0) +prefetch_L1(F"baseU", 1) +prefetch_L1(F"baseU", 2) +asmclose() +curlyclose() +newline() + +d['factor'] = 0 +write('// LOAD_CHI') +definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') +if ASM_LOAD_CHIMU: + curlyopen() + #write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + fetch_base_ptr(F"base", target='I') + fetch_base_ptr(F"base", target='A') + + Chi_00.load("ref[0][0]", offset=0) + Chi_01.load("ref[0][1]", offset=0) + Chi_02.load("ref[0][2]", offset=0) + Chi_10.load("ref[1][0]", offset=0) + Chi_11.load("ref[1][1]", offset=0) + Chi_12.load("ref[1][2]", offset=0) + asmclose() + debugall('LOAD_CHI', group='Chi') + curlyclose() +newline() + + + +d['factor'] = 8 +# 12 loads = 12 issues, load latency = 8+1 cycles +# (not perfectly clear to me from docs) +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU') +definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') +if ASM_LOAD_CHIMU: + curlyopen() + #write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + #fetch_base_ptr("&ref[0][0]") + 
#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') + fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + # Chimu_00.load("ref[0][0]") + # Chimu_01.load("ref[0][1]") + # Chimu_02.load("ref[0][2]") + # Chimu_10.load("ref[1][0]") + # Chimu_11.load("ref[1][1]") + # Chimu_12.load("ref[1][2]") + # Chimu_20.load("ref[2][0]") + # Chimu_21.load("ref[2][1]") + # Chimu_22.load("ref[2][2]") + # Chimu_30.load("ref[3][0]") + # Chimu_31.load("ref[3][1]") + # Chimu_32.load("ref[3][2]") + + Chimu_00.load("ref[0][0]") # minimum penalty for all directions + Chimu_30.load("ref[3][0]") + Chimu_10.load("ref[1][0]") + Chimu_20.load("ref[2][0]") + + Chimu_01.load("ref[0][1]") + Chimu_31.load("ref[3][1]") + Chimu_11.load("ref[1][1]") + Chimu_21.load("ref[2][1]") + + Chimu_02.load("ref[0][2]") + Chimu_32.load("ref[3][2]") + Chimu_12.load("ref[1][2]") + Chimu_22.load("ref[2][2]") + asmclose() + debugall('LOAD_CHIMU', group='Chimu') + curlyclose() +newline() + +# alternative load chimu: dirac order 0213 +# placed into asm (...) +d['factor'] = 0 +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU_0213') +definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') +if ASM_LOAD_CHIMU: + curlyopen() + write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + Chimu_00.load("ref[0][0]") # reordered + Chimu_20.load("ref[2][0]") + + Chimu_01.load("ref[0][1]") + Chimu_21.load("ref[2][1]") + + Chimu_02.load("ref[0][2]") + Chimu_22.load("ref[2][2]") + + Chimu_10.load("ref[1][0]") + Chimu_30.load("ref[3][0]") + + Chimu_11.load("ref[1][1]") + Chimu_31.load("ref[3][1]") + + Chimu_12.load("ref[1][2]") + Chimu_32.load("ref[3][2]") + asmclose() + debugall('LOAD_CHIMU_0213', group='Chimu') + curlyclose() +newline() + +# alternative load chimu: dirac order 0312 +# placed into asm (...) 
+d['factor'] = 0 +d['cycles_LOAD_CHIMU'] += 11 * d['factor'] +write('// LOAD_CHIMU_0312') +definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') +if ASM_LOAD_CHIMU: + curlyopen() + write(' const SiteSpinor & ref(in[offset]); \\') + asmopen() + pg1.loadpredication() + fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") + Chimu_00.load("ref[0][0]") # reordered + Chimu_30.load("ref[3][0]") + + Chimu_01.load("ref[0][1]") + Chimu_31.load("ref[3][1]") + + Chimu_02.load("ref[0][2]") + Chimu_32.load("ref[3][2]") + + Chimu_10.load("ref[1][0]") + Chimu_20.load("ref[2][0]") + + Chimu_11.load("ref[1][1]") + Chimu_21.load("ref[2][1]") + + Chimu_12.load("ref[1][2]") + Chimu_22.load("ref[2][2]") + asmclose() + debugall('LOAD_CHIMU_0312', group='Chimu') + curlyclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE0') +definemultiline(F'LOAD_TABLE0') +asmopen() +table0.loadtable(0) +asmclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE1') +definemultiline(F'LOAD_TABLE1') +asmopen() +table0.loadtable(1) +asmclose() +newline() + +d['factor'] = 2 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE2') +definemultiline(F'LOAD_TABLE2') +asmopen() +table0.loadtable(2) +asmclose() +newline() + +d['factor'] = 0 +d['cycles_PERM'] += 1 * d['factor'] +write('// LOAD_TABLE3') +definemultiline(F'LOAD_TABLE3') +asmopen() +table0.loadtable(3) +asmclose() +newline() + +d['factor'] = 2 # factor is 2 +d['cycles_PERM'] += 6 * d['factor'] +write('// PERMUTE') +definemultiline(F'PERMUTE_{PRECSUFFIX}') +debugall('PERM PRE', group='Chi') +asmopen() +#table0.loadtable(2) +Chi_00.permute(2, table0) +Chi_01.permute(2, table0) +Chi_02.permute(2, table0) +Chi_10.permute(2, table0) +Chi_11.permute(2, table0) +Chi_12.permute(2, table0) +asmclose() +debugall('PERM POST', group='Chi') +newline() + +write('// LOAD_GAUGE') +definemultiline(F'LOAD_GAUGE') +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +curlyopen() +asmopen() +pg1.loadpredication() +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') +if ASM_LOAD_GAUGE: + fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') + U_00.load("ref[0][0]") + U_10.load("ref[1][0]") + U_20.load("ref[2][0]") + U_01.load("ref[0][1]") + U_11.load("ref[1][1]") + U_21.load("ref[2][1]") +asmclose() +curlyclose() +newline() + +d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total +# assume all U loads are hidden +# FCMLA issue latency = 2 cycles +# measurement: latency = 16 cycles if FULLY pipelined !? 
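# Illustration (not part of the generated kernels): each complex
# multiply(-accumulate) in MULT_2SPIN below is split into an FCMLA with
# rotation 0 (mul0/mac0) followed by an FCMLA with rotation 90 (mul1/mac1);
# only the pair accumulates the full complex product.  The lane layout and
# rotation convention below are assumptions, but the combined result is the
# ordinary complex product either way.

def fcmla(acc, a, b, rot):
    out = list(acc)
    for i in range(0, len(acc), 2):
        ar, ai = a[i], a[i + 1]
        br, bi = b[i], b[i + 1]
        if rot == 0:                       # real part of a times b
            out[i]     += ar * br
            out[i + 1] += ar * bi
        elif rot == 90:                    # imaginary part of a times i*b
            out[i]     += -ai * bi
            out[i + 1] += ai * br
    return out

acc = [0.0, 0.0]
acc = fcmla(acc, [1.0, 2.0], [3.0, 4.0], 0)     # mul0 / mac0
acc = fcmla(acc, [1.0, 2.0], [3.0, 4.0], 90)    # mul1 / mac1
assert acc == [-5.0, 10.0]                      # (1+2j) * (3+4j) = -5 + 10j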
+# spec says 6+6+9 cycles +# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 +d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] +write('// MULT_2SPIN') +definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') +curlyopen() +#write(' const auto & ref(U[sU][A]); \\') +if GRIDBENCH: # referencing differs in Grid and GridBench + write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') +else: + write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') +asmopen() +#pg1.loadpredication() +#fetch_base_ptr("&ref[0][0]") +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') +fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') +#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') +#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') +#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") +if ASM_LOAD_GAUGE: + U_00.load("ref[0][0]") + U_10.load("ref[1][0]") + U_20.load("ref[2][0]") + U_01.load("ref[0][1]") + U_11.load("ref[1][1]") + U_21.load("ref[2][1]") + +if MOVPRFX == False: + UChi_00.zero() # implementation specific + UChi_10.zero() + UChi_01.zero() + UChi_11.zero() + UChi_02.zero() + UChi_12.zero() + + # round 1 + UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles + UChi_10.mul0(U_00, Chi_10) + UChi_01.mul0(U_10, Chi_00) + UChi_11.mul0(U_10, Chi_10) + UChi_02.mul0(U_20, Chi_00) + UChi_12.mul0(U_20, Chi_10) +else: + # round 1 + UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles + UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) + UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) + UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) + UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) + UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) + +# round 2 +UChi_00.mul1(U_00, Chi_00) +UChi_10.mul1(U_00, Chi_10) +UChi_01.mul1(U_10, Chi_00) +UChi_11.mul1(U_10, Chi_10) +UChi_02.mul1(U_20, Chi_00) +UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here + +if ASM_LOAD_GAUGE: + U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded + U_10.load("ref[1][2]") # early load + U_20.load("ref[2][2]") # A --> +asmclose() +debugall('MULT_2SPIN_1', group='UChi') +curlyclose() +newline() + +write('// MULT_2SPIN_BACKEND') +definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') +curlyopen() +asmopen() +# round 3 +UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and +UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) +UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics +UChi_11.mac0(U_11, Chi_11) +UChi_02.mac0(U_21, Chi_01) +UChi_12.mac0(U_21, Chi_11) +# round 4 +UChi_00.mac1(U_01, Chi_01) +UChi_10.mac1(U_01, Chi_11) +UChi_01.mac1(U_11, Chi_01) +UChi_11.mac1(U_11, Chi_11) +UChi_02.mac1(U_21, Chi_01) +UChi_12.mac1(U_21, Chi_11) +# round 5 +UChi_00.mac0(U_00, Chi_02) # <-- A +UChi_10.mac0(U_00, Chi_12) +UChi_01.mac0(U_10, Chi_02) +UChi_11.mac0(U_10, Chi_12) +UChi_02.mac0(U_20, Chi_02) +UChi_12.mac0(U_20, Chi_12) +# round 6 +UChi_00.mac1(U_00, Chi_02) +UChi_10.mac1(U_00, Chi_12) +UChi_01.mac1(U_10, Chi_02) +UChi_11.mac1(U_10, Chi_12) +UChi_02.mac1(U_20, Chi_02) +UChi_12.mac1(U_20, Chi_12) +asmclose() +debugall('MULT_2SPIN_2', group='UChi') +curlyclose() +newline() + + +#// hspin(0)=fspin(0)+timesI(fspin(3)); +#// hspin(1)=fspin(1)+timesI(fspin(2)); +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// XP_PROJ') +definemultiline(F'XP_PROJ_{PRECSUFFIX}') +if 
ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.addTimesI(Chimu_00, Chimu_30) +Chi_01.addTimesI(Chimu_01, Chimu_31) +Chi_02.addTimesI(Chimu_02, Chimu_32) +Chi_10.addTimesI(Chimu_10, Chimu_20) +Chi_11.addTimesI(Chimu_11, Chimu_21) +Chi_12.addTimesI(Chimu_12, Chimu_22) +asmclose() +debugall('XP_PROJ', group='Chi') +curlyclose() +newline() + +#// fspin(0)=hspin(0); +#// fspin(1)=hspin(1); +#// fspin(2)=timesMinusI(hspin(1)); +#// fspin(3)=timesMinusI(hspin(0)); +# does not occur in GridBench +d['factor'] = 0 +d['cycles_RECON'] += 15 * d['factor'] +write('// XP_RECON') +definemultiline(F'XP_RECON_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +if MOVPRFX == False: + result_20.zero() + result_21.zero() + result_22.zero() + result_30.zero() + result_31.zero() + result_32.zero() + + result_20.subTimesI(UChi_10) + result_21.subTimesI(UChi_11) + result_22.subTimesI(UChi_12) + result_30.subTimesI(UChi_00) + result_31.subTimesI(UChi_01) + result_32.subTimesI(UChi_02) +else: + result_20.subTimesI(zero0, UChi_10, constructive=True) + result_21.subTimesI(zero0, UChi_11, constructive=True) + result_22.subTimesI(zero0, UChi_12, constructive=True) + result_30.subTimesI(zero0, UChi_00, constructive=True) + result_31.subTimesI(zero0, UChi_01, constructive=True) + result_32.subTimesI(zero0, UChi_02, constructive=True) + +result_00.move(UChi_00) # don't reorder ! +result_01.move(UChi_01) +result_02.move(UChi_02) +result_10.move(UChi_10) +result_11.move(UChi_11) +result_12.move(UChi_12) + +# result_00.add(UChi_00) # faster than move? +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +asmclose() +debugall('XP_RECON', group='result') +newline() + + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_RECON'] += 15 * d['factor'] +write('// XP_RECON_ACCUM') +definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.subTimesI(UChi_10) +# result_21.subTimesI(UChi_11) +# result_22.subTimesI(UChi_12) +# result_30.subTimesI(UChi_00) +# result_31.subTimesI(UChi_01) +# result_32.subTimesI(UChi_02) +# +# result_00.add(UChi_00) # reordered +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) + +result_30.subTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_31.subTimesI(UChi_01) +result_01.add(UChi_01) + +result_32.subTimesI(UChi_02) +result_02.add(UChi_02) + +result_20.subTimesI(UChi_10) +result_10.add(UChi_10) + +result_21.subTimesI(UChi_11) +result_11.add(UChi_11) + +result_22.subTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('XP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// YP_PROJ') +definemultiline(F'YP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.sub(Chimu_00, Chimu_30) +Chi_01.sub(Chimu_01, Chimu_31) +Chi_02.sub(Chimu_02, Chimu_32) +Chi_10.add(Chimu_10, Chimu_20) +Chi_11.add(Chimu_11, Chimu_21) +Chi_12.add(Chimu_12, Chimu_22) +asmclose() +debugall('YP_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// ZP_PROJ') +definemultiline(F'ZP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' 
LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.addTimesI(Chimu_00, Chimu_20) +Chi_01.addTimesI(Chimu_01, Chimu_21) +Chi_02.addTimesI(Chimu_02, Chimu_22) +Chi_10.subTimesI(Chimu_10, Chimu_30) +Chi_11.subTimesI(Chimu_11, Chimu_31) +Chi_12.subTimesI(Chimu_12, Chimu_32) +asmclose() +debugall('ZP_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// TP_PROJ') +definemultiline(F'TP_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.add(Chimu_00, Chimu_20) +Chi_01.add(Chimu_01, Chimu_21) +Chi_02.add(Chimu_02, Chimu_22) +Chi_10.add(Chimu_10, Chimu_30) +Chi_11.add(Chimu_11, Chimu_31) +Chi_12.add(Chimu_12, Chimu_32) +asmclose() +debugall('TP_PROJ', group='Chi') +curlyclose() +newline() + +#// hspin(0)=fspin(0)-timesI(fspin(3)); +#// hspin(1)=fspin(1)-timesI(fspin(2)); + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// XM_PROJ') +definemultiline(F'XM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.subTimesI(Chimu_00, Chimu_30) +Chi_01.subTimesI(Chimu_01, Chimu_31) +Chi_02.subTimesI(Chimu_02, Chimu_32) +Chi_10.subTimesI(Chimu_10, Chimu_20) +Chi_11.subTimesI(Chimu_11, Chimu_21) +Chi_12.subTimesI(Chimu_12, Chimu_22) +asmclose() +debugall('XM_PROJ sub', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// XM_RECON') +definemultiline(F'XM_RECON_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() + +# only necessary if not zeroed before +if MOVPRFX == False: + result_20.zero() + result_21.zero() + result_22.zero() + result_30.zero() + result_31.zero() + result_32.zero() + + result_20.addTimesI(UChi_10) # <-- + result_21.addTimesI(UChi_11) + result_22.addTimesI(UChi_12) + result_30.addTimesI(UChi_00) + result_31.addTimesI(UChi_01) + result_32.addTimesI(UChi_02) +else: + result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- + result_21.addTimesI(zero0, UChi_11, constructive=True) + result_22.addTimesI(zero0, UChi_12, constructive=True) + result_30.addTimesI(zero0, UChi_00, constructive=True) + result_31.addTimesI(zero0, UChi_01, constructive=True) + result_32.addTimesI(zero0, UChi_02, constructive=True) + +result_00.move(UChi_00) +result_01.move(UChi_01) +result_02.move(UChi_02) +result_10.move(UChi_10) +result_11.move(UChi_11) +result_12.move(UChi_12) +asmclose() +debugall('XM_RECON result', group='result') +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// YM_PROJ') +definemultiline(F'YM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0312_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.add(Chimu_00, Chimu_30) +Chi_01.add(Chimu_01, Chimu_31) +Chi_02.add(Chimu_02, Chimu_32) +Chi_10.sub(Chimu_10, Chimu_20) +Chi_11.sub(Chimu_11, Chimu_21) +Chi_12.sub(Chimu_12, Chimu_22) +asmclose() +debugall('YM_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# FCADD issue latency = 1, latency is 6+9 +d['cycles_PROJ'] += 15 * d['factor'] +write('// ZM_PROJ') +definemultiline(F'ZM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +#pg1.loadpredication() +Chi_00.subTimesI(Chimu_00, Chimu_20) 
+Chi_01.subTimesI(Chimu_01, Chimu_21) +Chi_02.subTimesI(Chimu_02, Chimu_22) +Chi_10.addTimesI(Chimu_10, Chimu_30) +Chi_11.addTimesI(Chimu_11, Chimu_31) +Chi_12.addTimesI(Chimu_12, Chimu_32) +asmclose() +debugall('ZM_PROJ', group='Chi') +curlyclose() +newline() + +d['factor'] = 1 +# add/sub issue latency = 1, latency is 9 +d['cycles_PROJ'] += 9 * d['factor'] +write('// TM_PROJ') +definemultiline(F'TM_PROJ_{PRECSUFFIX}') +if ALTERNATIVE_LOADS == True: + write(' LOAD_CHIMU_0213_PLUG \\') +curlyopen() +asmopen() +pg1.loadpredication() +Chi_00.sub(Chimu_00, Chimu_20) +Chi_01.sub(Chimu_01, Chimu_21) +Chi_02.sub(Chimu_02, Chimu_22) +Chi_10.sub(Chimu_10, Chimu_30) +Chi_11.sub(Chimu_11, Chimu_31) +Chi_12.sub(Chimu_12, Chimu_32) +asmclose() +debugall('TM_PROJ', group='Chi') +curlyclose() +newline() + +# does not occur in GridBench +d['factor'] = 0 +# add/sub issue latency = 1, latency is 9 +d['cycles_RECON'] += 15 * d['factor'] +write('// XM_RECON_ACCUM') +definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +# result_20.addTimesI(UChi_10) +# result_21.addTimesI(UChi_11) +# result_22.addTimesI(UChi_12) +# result_30.addTimesI(UChi_00) +# result_31.addTimesI(UChi_01) +# result_32.addTimesI(UChi_02) +# +# # result_00.move(UChi_00) +# # result_01.move(UChi_01) +# # result_02.move(UChi_02) +# # result_10.move(UChi_10) +# # result_11.move(UChi_11) +# # result_12.move(UChi_12) +# +# # faster than move ? +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) + +result_30.addTimesI(UChi_00) # reordered +result_31.addTimesI(UChi_01) +result_32.addTimesI(UChi_02) + +result_20.addTimesI(UChi_10) +result_21.addTimesI(UChi_11) +result_22.addTimesI(UChi_12) + +result_00.add(UChi_00) +result_01.add(UChi_01) +result_02.add(UChi_02) +result_10.add(UChi_10) +result_11.add(UChi_11) +result_12.add(UChi_12) +asmclose() +debugall('XM_RECON_ACCUM', group='result') +newline() + + + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// YP_RECON_ACCUM') +definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.add(UChi_10) +# result_21.add(UChi_11) +# result_22.add(UChi_12) +# result_30.sub(UChi_00) +# result_31.sub(UChi_01) +# result_32.sub(UChi_02) + +result_00.add(UChi_00) # reordered +result_30.sub(UChi_00) + +result_01.add(UChi_01) +result_31.sub(UChi_01) + +result_02.add(UChi_02) +result_32.sub(UChi_02) + +result_10.add(UChi_10) +result_20.add(UChi_10) + +result_11.add(UChi_11) +result_21.add(UChi_11) + +result_12.add(UChi_12) +result_22.add(UChi_12) +asmclose() +debugall('YP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// YM_RECON_ACCUM') +definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.sub(UChi_10) +# result_21.sub(UChi_11) +# result_22.sub(UChi_12) +# result_30.add(UChi_00) +# result_31.add(UChi_01) +# result_32.add(UChi_02) + +result_00.add(UChi_00) # reordered +result_30.add(UChi_00) + +result_01.add(UChi_01) +result_31.add(UChi_01) + +result_02.add(UChi_02) +result_32.add(UChi_02) + +result_10.add(UChi_10) +result_20.sub(UChi_10) + 
+result_11.add(UChi_11) +result_21.sub(UChi_11) + +result_12.add(UChi_12) +result_22.sub(UChi_12) +asmclose() +debugall('YM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// ZP_RECON_ACCUM') +definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.subTimesI(UChi_00) +# result_21.subTimesI(UChi_01) +# result_22.subTimesI(UChi_02) +# result_30.addTimesI(UChi_10) +# result_31.addTimesI(UChi_11) +# result_32.addTimesI(UChi_12) +# +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +result_20.subTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_21.subTimesI(UChi_01) +result_01.add(UChi_01) + +result_22.subTimesI(UChi_02) +result_02.add(UChi_02) + +result_30.addTimesI(UChi_10) +result_10.add(UChi_10) + +result_31.addTimesI(UChi_11) +result_11.add(UChi_11) + +result_32.addTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('ZP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 15 * d['factor'] +write('// ZM_RECON_ACCUM') +definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_20.addTimesI(UChi_00) +# result_21.addTimesI(UChi_01) +# result_22.addTimesI(UChi_02) +# result_30.subTimesI(UChi_10) +# result_31.subTimesI(UChi_11) +# result_32.subTimesI(UChi_12) +# +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +result_20.addTimesI(UChi_00) # reordered +result_00.add(UChi_00) + +result_21.addTimesI(UChi_01) +result_01.add(UChi_01) + +result_22.addTimesI(UChi_02) +result_02.add(UChi_02) + +result_30.subTimesI(UChi_10) +result_10.add(UChi_10) + +result_31.subTimesI(UChi_11) +result_11.add(UChi_11) + +result_32.subTimesI(UChi_12) +result_12.add(UChi_12) +asmclose() +debugall('ZM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// TP_RECON_ACCUM') +definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.add(UChi_00) +# result_21.add(UChi_01) +# result_22.add(UChi_02) +# result_30.add(UChi_10) +# result_31.add(UChi_11) +# result_32.add(UChi_12) + +result_00.add(UChi_00) # reordered +result_20.add(UChi_00) + +result_01.add(UChi_01) +result_21.add(UChi_01) + +result_02.add(UChi_02) +result_22.add(UChi_02) + +result_10.add(UChi_10) +result_30.add(UChi_10) + +result_11.add(UChi_11) +result_31.add(UChi_11) + +result_12.add(UChi_12) +result_32.add(UChi_12) +asmclose() +debugall('TP_RECON_ACCUM', group='result') +newline() + +d['factor'] = 1 +d['cycles_RECON'] += 9 * d['factor'] +write('// TM_RECON_ACCUM') +definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') +asmopen() +#pg1.loadpredication() +# result_00.add(UChi_00) +# result_01.add(UChi_01) +# result_02.add(UChi_02) +# result_10.add(UChi_10) +# result_11.add(UChi_11) +# result_12.add(UChi_12) +# result_20.sub(UChi_00) +# result_21.sub(UChi_01) +# result_22.sub(UChi_02) +# result_30.sub(UChi_10) +# result_31.sub(UChi_11) +# result_32.sub(UChi_12) + +result_00.add(UChi_00) # reordered +result_20.sub(UChi_00) + +result_01.add(UChi_01) +result_21.sub(UChi_01) + +result_02.add(UChi_02) +result_22.sub(UChi_02) + 
+result_10.add(UChi_10) +result_30.sub(UChi_10) + +result_11.add(UChi_11) +result_31.sub(UChi_11) + +result_12.add(UChi_12) +result_32.sub(UChi_12) +asmclose() +debugall('TM_RECON_ACCUM', group='result') +newline() + +d['factor'] = 0 +# have 12 instructions +# picking dual issue versions +d['cycles_ZERO_PSI'] += 6 * d['factor'] +write('// ZERO_PSI') +definemultiline(F'ZERO_PSI_{PRECSUFFIX}') +asmopen() +pg1.loadpredication() +result_00.zero() +result_01.zero() +result_02.zero() +result_10.zero() +result_11.zero() +result_12.zero() +result_20.zero() +result_21.zero() +result_22.zero() +result_30.zero() +result_31.zero() +result_32.zero() +asmclose() +#debugall('ZERO_PSI', group='result') +newline() + +# prefetch store spinors to L2 cache +d['factor'] = 0 +d['cycles_PREFETCH_L2'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') +definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L2_store(F"base", 0) +prefetch_L2_store(F"base", 1) +prefetch_L2_store(F"base", 2) +asmclose() +curlyclose() +newline() + +# prefetch store spinors to L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') +definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L1_store(F"base", 0) +prefetch_L1_store(F"base", 1) +prefetch_L1_store(F"base", 2) +asmclose() +curlyclose() +newline() + + +d['factor'] = 0 +write('// ADD_RESULT_INTERNAL') +definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') +asmopen() +result_00.add(Chimu_00) +result_01.add(Chimu_01) +result_02.add(Chimu_02) +result_10.add(Chimu_10) +result_11.add(Chimu_11) +result_12.add(Chimu_12) +result_20.add(Chimu_20) +result_21.add(Chimu_21) +result_22.add(Chimu_22) +result_30.add(Chimu_30) +result_31.add(Chimu_31) +result_32.add(Chimu_32) +asmclose() +#debugall('ZERO_PSI', group='result') +newline() + +# -------------------------------------------------------------------------------- + +# C +f = open('w.h', 'w') +f.write(d['C']) +f.close() + +# intrin +f = open('wi.h', 'w') +f.write(d['I']) +f.close() + +filename = '' +if PRECISION == 'double': + filename = "Fujitsu_A64FX_intrin_double.h" +else: + filename = "Fujitsu_A64FX_intrin_single.h" +f = open(filename, 'w') +f.write(LEGAL.format(filename)) +f.write(d['I']) +f.close() + + +# asm +f = open('wa.h', 'w') +f.write(d['A']) +f.close() + +filename = '' +if PRECISION == 'double': + filename = "Fujitsu_A64FX_asm_double.h" +else: + filename = "Fujitsu_A64FX_asm_single.h" +f = open(filename, 'w') +f.write(LEGAL.format(filename)) +f.write(d['A']) +f.close() + + +# arithmetics instruction count, mul/mac = 2 instructions each +d['acount'] = d['add'] + d['sub'] + \ + d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] + +# permutations +d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] +d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] + +# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each +d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ + d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ + d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] + +# flops +d['flops'] = 4*d['mac'] + 3*d['mul'] + d['add'] + d['sub'] + \ + d['addTimesI'] + d['subTimesI'] + + + + + +print('Statistics') +print('') +print('Type Occurences Total / Arith 
instructions') +print('-------------------------------------------------------------------') +print('Variables {:4d}'.format(d['registers'])) +print('') +print('load {:4d}'.format(d['load'])) +print('store {:4d}'.format(d['store'])) +print('move {:4d}'.format(d['move'])) +print('movprfx {:4d}'.format(d['movprfx'])) +print('zero {:4d}'.format(d['zero'])) +print('negate {:4d}'.format(d['neg'])) + + +print('add {:4d} {:0.2f} / {:0.2f}'.\ + format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) +print('sub {:4d} {:0.2f} / {:0.2f}'.\ + format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) +print('mul {:4d} {:0.2f} / {:0.2f}'.\ + format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) +print('mac {:4d} {:0.2f} / {:0.2f}'.\ + format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) +print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ + format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) +print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ + format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) + +print('timesI {:4d}'.format(d['timesI'])) +print('timesMinusI {:4d}'.format(d['timesMinusI'])) +print('permutes {:4d} {:0.2f}'.\ + format(d['permutes'], d['permutes'] / d['icount'])) +print('') +print('flops {:4d}'.format(d['flops'])) +print('instruction count {:4d}'.format(d['icount'])) +print('arith. instruction count {:4d} {:0.2f}'.\ + format(d['acount'], d['acount'] / d['icount'])) + + +# ---- static pipeline resources consumption ---- +FLA = 0 +FLA += 2 * d['mac'] + 2 * d['mul'] +FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] +FLA += 1 * d['move'] +FLA += 1 * d['permutes'] +FLA += 1 * d['store'] +FLA += 1 * d['zero'] + +FLB = 0 +FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] + +FLAB = 0 +FLAB += 1 * d['mac'] + 1 * d['mul'] +FLAB += 1 * d['add'] + 1 * d['sub'] +FLAB += 1 * d['neg'] + 1 * d['movprfx'] +#FLAB += 1 * d['zero'] + + +FL_slots = 2 * d['icount'] +FL_micro_ops = FLA + FLB + FLAB + +print('') +print('------------------------------------------------------------------') +print('') +print('Static FL slot usage') +print('') +print(' FLA {:4d}'.format(FLA)) +print(' FLB {:4d}'.format(FLB)) +print(' FLA/B {:4d}'.format(FLAB)) + +print('') +print('Static FL slot efficiency') +print('') +print(' Total FL slots {:4d}'.format(FL_slots)) +print(' FL slots occupied {:4d}'.format(FL_micro_ops)) +print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) + +cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ + d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ + d['cycles_RECON'] + d['cycles_RESULT'] +cycles_total_hidden = d['cycles_ZERO_PSI'] + \ + d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ + d['cycles_RECON'] + +# ---- dynamic estimate ---- + +print('') +print('Dynamic cycles estimate (incl. 
latencies)') +print('') +print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) +print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) +print(' PROJ {:4d}'.format(d['cycles_PROJ'])) +print(' PERM {:4d}'.format(d['cycles_PERM'])) +print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) +print(' RECON {:4d}'.format(d['cycles_RECON'])) +print(' STORE {:4d}'.format(d['cycles_RESULT'])) +print('') +print(' Sum {:4d}'.format(cycles_total)) +print('') +print(' Sum* {:4d}'.format(cycles_total_hidden)) +print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) +print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) +print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) +print('') +print(' *load/store/PERM hidden') + +estimated_cycles = cycles_total_hidden +# Estimate percent peak DP; dual issue, fma +pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) +print('') +print('Model prediction') +print('') +print(' Cycles* {:4d}'.format(estimated_cycles)) +print(' Percent peak* {:4.1f} %'.format(pp)) + +# estimated RF throughput in GB/s @ 2.2 GHz +tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles +tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles +print('') +print(' Estimated RF throughput* {:4.1f} GB/s'.\ + format(tp10)) +print(' Estimated RF throughput* {:4.1f} GiB/s'.\ + format(tp2)) + +# ---- dynamic pipeline resources consumption ---- + +runtime = measured_cycles # runtime in cycles +pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) +runtime_FL_slots = 2 * runtime +delta = runtime - estimated_cycles + + +print('') +print('------------------------------------------------------------------') +print('') +print('Dynamic runtime analysis (cycles from measurements)') +print('') +print(' Cycles {:4d}'.format(runtime)) +print(' Percent peak {:4.1f} %'.format(pp_runtime)) +print(' Deviation from estimate {:4d} {:4.2f} %'.\ + format(delta, 100. * abs(delta/runtime))) +print(' Deviation per direction {:4.1f}'.format(delta/8)) + +# estimated RF throughput in GB/s @ 2.2 GHz +tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime +tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime +print('') +print(' RF throughput {:4.1f} GB/s'.\ + format(tp10_rt)) +print(' RF throughput {:4.1f} GiB/s'.\ + format(tp2_rt)) +print('') +print(' Total FL slots {:4d}'.format(runtime_FL_slots)) +print(' FL slots occupied {:4d}'.format(FL_micro_ops)) +print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) +print('') diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 37b866cb..1e198972 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -1,6 +1,6 @@ /************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid + Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/Stencil.h @@ -41,13 +41,13 @@ // Stencil based code will exchange haloes and use a table lookup for neighbours. // This will be done with generality to allow easier efficient implementations. // Overlap of comms and compute is enabled by tabulating off-node connected, -// +// // Generic services // 0) Prebuild neighbour tables // 1) Compute sizes of all haloes/comms buffers; allocate them. // 2) Gather all faces, and communicate. 
// 3) Loop over result sites, giving nbr index/offnode info for each -// +// ////////////////////////////////////////////////////////////////////////////////////////// NAMESPACE_BEGIN(Grid); @@ -59,15 +59,16 @@ NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,Vector > & table); -template +template void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); -template +template void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); std::pair *table_v = & table[0]; - auto rhs_v = rhs.View(); + + auto rhs_v = rhs.View(AcceleratorRead); accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; compressed_t tmp_c; @@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector >& table,const Lattice compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); coalescedWrite(buffer[off+o],tmp_c); }); + rhs_v.ViewClose(); // Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table } @@ -92,37 +94,38 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic { assert( (table.size()&0x1)==0); int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - auto rhs_v = rhs.View(); + auto rhs_v = rhs.View(AcceleratorRead); auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; - accelerator_forNB(j, num, 1, { + accelerator_forNB(j, num, 1, { compress.CompressExchange(p0,p1, &rhs_v[0], j, so+tp[2*j ].second, so+tp[2*j+1].second, type); }); + rhs_v.ViewClose(); } -struct StencilEntry { -#ifdef GRID_NVCC - uint64_t _byte_offset; // 8 bytes - uint32_t _offset; // 4 bytes +struct StencilEntry { +#ifdef GRID_CUDA + uint64_t _byte_offset; // 8 bytes + uint32_t _offset; // 4 bytes #else - uint64_t _byte_offset; // 8 bytes + uint64_t _byte_offset; // 8 bytes uint64_t _offset; // 8 bytes (8 ever required?) #endif - uint8_t _is_local; // 1 bytes + uint8_t _is_local; // 1 bytes uint8_t _permute; // 1 bytes uint8_t _around_the_world; // 1 bytes uint8_t _pad; // 1 bytes }; -// Could pack to 8 + 4 + 4 = 128 bit and use +// Could pack to 8 + 4 + 4 = 128 bit and use template -class CartesianStencilView { +class CartesianStencilAccelerator { public: typedef AcceleratorVector StencilVector; @@ -130,14 +133,15 @@ class CartesianStencilView { //////////////////////////////////////// // Basic Grid and stencil info //////////////////////////////////////// - int _checkerboard; - int _npoints; // Move to template param? + int _checkerboard; + int _npoints; // Move to template param? 
+ int _osites; StencilVector _directions; StencilVector _distances; StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; - Coordinate _simd_layout; + Coordinate _simd_layout; Parameters parameters; StencilEntry* _entries_p; cobj* u_recv_buf_p; @@ -145,18 +149,18 @@ class CartesianStencilView { accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) { + accelerator_inline int GetNodeLocal(int osite,int point) { return this->_entries_p[point+this->_npoints*osite]._is_local; } - accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { - ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; + accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; } accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = this->_entries_p[ent]._is_local; perm = this->_entries_p[ent]._permute; - if (perm) ptype = this->_permute_type[point]; + if (perm) ptype = this->_permute_type[point]; if (local) { return base + this->_entries_p[ent]._byte_offset; } else { @@ -171,17 +175,47 @@ class CartesianStencilView { else return cbase + this->_entries_p[ent]._byte_offset; } - accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) + accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } +}; + +template +class CartesianStencilView : public CartesianStencilAccelerator +{ + private: + int *closed; + StencilEntry *cpu_ptr; + ViewMode mode; + public: + // default copy constructor + CartesianStencilView (const CartesianStencilView &refer_to_me) = default; + + CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode _mode) + : CartesianStencilAccelerator(refer_to_me), + cpu_ptr(this->_entries_p), + mode(_mode) + { + this->_entries_p =(StencilEntry *) + MemoryManager::ViewOpen(this->_entries_p, + this->_npoints*this->_osites*sizeof(StencilEntry), + mode, + AdviseDefault); + } + + void ViewClose(void) + { + MemoryManager::ViewClose(this->cpu_ptr,this->mode); + } }; + //////////////////////////////////////// // The Stencil Class itself //////////////////////////////////////// template -class CartesianStencil : public CartesianStencilView { // Stencil runs along coordinate axes only; NO diagonal fill in. +class CartesianStencil : public CartesianStencilAccelerator { // Stencil runs along coordinate axes only; NO diagonal fill in. 
public: typedef typename cobj::vector_type vector_type; @@ -211,12 +245,12 @@ public: cobj * mpi_p; Integer buffer_size; }; - + protected: GridBase * _grid; -public: +public: GridBase *Grid(void) const { return _grid; } //////////////////////////////////////////////////////////////////////// @@ -226,11 +260,11 @@ public: // Generalise as required later if needed //////////////////////////////////////////////////////////////////////// - View_type View(void) const { - View_type accessor(*( (View_type *) this)); + View_type View(ViewMode mode) const { + View_type accessor(*( (View_type *) this),mode); return accessor; } - + int face_table_computed; std::vector > > face_table ; Vector surface_list; @@ -280,7 +314,7 @@ public: //////////////////////////////////////// // Stencil query //////////////////////////////////////// - inline int SameNode(int point) { + inline int SameNode(int point) { int dimension = this->_directions[point]; int displacement = this->_distances[point]; @@ -304,7 +338,7 @@ public: // FIXME this logic needs to be sorted for three link term // assert( (displacement==1) || (displacement==-1)); // Present hack only works for >= 4^4 subvol per node - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); @@ -344,7 +378,7 @@ public: comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; } } - + void CollateThreads(void) { int nthreads = CartesianCommunicator::nCommThreads; @@ -368,7 +402,7 @@ public: if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen if ( t1 > last ) last = t1; // max time seen - + } commtime+= last-first; } @@ -430,30 +464,30 @@ public: this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); } - } - - template void HaloExchange(const Lattice &source,compressor &compress) + } + + template void HaloExchange(const Lattice &source,compressor &compress) { Prepare(); HaloGather(source,compress); Communicate(); - CommsMergeSHM(compress); - CommsMerge(compress); + CommsMergeSHM(compress); + CommsMerge(compress); } - + template int HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) { int dimension = this->_directions[point]; int displacement = this->_distances[point]; - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; - + // Map to always positive shift modulo global full dimension. 
int shift = (displacement+fd)%fd; assert (source.Checkerboard()== this->_checkerboard); - + // the permute type int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; @@ -471,7 +505,7 @@ public: auto tmp = GatherSimd(source,dimension,shift,0x3,compress,face_idx); is_same_node = is_same_node && tmp; splicetime+=usecond(); - } else { + } else { nosplicetime-=usecond(); auto tmp = Gather(source,dimension,shift,0x3,compress,face_idx); is_same_node = is_same_node && tmp; @@ -497,7 +531,7 @@ public: } return is_same_node; } - + template void HaloGather(const Lattice &source,compressor &compress) { @@ -508,9 +542,9 @@ public: // conformable(source.Grid(),_grid); assert(source.Grid()==_grid); halogtime-=usecond(); - + u_comm_offset=0; - + // Gather all comms buffers int face_idx=0; for(int point = 0 ; point < this->_npoints; point++) { @@ -523,16 +557,16 @@ public: accelerator_barrier(); halogtime+=usecond(); } - + ///////////////////////// // Implementation ///////////////////////// void Prepare(void) { - Decompressions.resize(0); - DecompressionsSHM.resize(0); - Mergers.resize(0); - MergersSHM.resize(0); + Decompressions.resize(0); + DecompressionsSHM.resize(0); + Mergers.resize(0); + MergersSHM.resize(0); Packets.resize(0); calls++; } @@ -561,22 +595,22 @@ public: mv.push_back(m); } template void CommsMerge(decompressor decompress) { - CommsMerge(decompress,Mergers,Decompressions); + CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { - mpi3synctime-=usecond(); + mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes - mpi3synctime+=usecond(); - shmmergetime-=usecond(); + mpi3synctime+=usecond(); + shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); - shmmergetime+=usecond(); + shmmergetime+=usecond(); } template - void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { + void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { mergetime-=usecond(); - for(int i=0;i_npoints;point++){ this->same_node[point] = this->SameNode(point); } - + for(int site = 0 ;site< vol4;site++){ int local = 1; for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ local = 0; } } - if(local == 0) { + if(local == 0) { surface_list.push_back(site); } } @@ -638,11 +672,11 @@ public: int checkerboard, const std::vector &directions, const std::vector &distances, - Parameters p) - : shm_bytes_thr(npoints), - comm_bytes_thr(npoints), + Parameters p) + : shm_bytes_thr(npoints), + comm_bytes_thr(npoints), comm_enter_thr(npoints), - comm_leave_thr(npoints), + comm_leave_thr(npoints), comm_time_thr(npoints) { face_table_computed=0; @@ -653,7 +687,7 @@ public: ///////////////////////////////////// this->_npoints = npoints; this->_comm_buf_size.resize(npoints), - this->_permute_type.resize(npoints), + this->_permute_type.resize(npoints), this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels this->_directions = StencilVector(directions); this->_distances = StencilVector(distances); @@ -662,25 +696,25 @@ public: _unified_buffer_size=0; surface_list.resize(0); - int osites = _grid->oSites(); - - _entries.resize(this->_npoints* osites); + this->_osites = _grid->oSites(); + + _entries.resize(this->_npoints* this->_osites); this->_entries_p = &_entries[0]; for(int 
ii=0;ii_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; this->_permute_type[point]=_grid->PermuteType(dimension); - + this->_checkerboard = checkerboard; - + ////////////////////////// // the permute type ////////////////////////// @@ -690,25 +724,25 @@ public: int rotate_dim = _grid->_simd_layout[dimension]>2; assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported - + int sshift[2]; - + ////////////////////////// // Underlying approach. For each local site build - // up a table containing the npoint "neighbours" and whether they + // up a table containing the npoint "neighbours" and whether they // live in lattice or a comms buffer. ////////////////////////// if ( !comm_dim ) { sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); - + if ( sshift[0] == sshift[1] ) { Local(point,dimension,shift,0x3); } else { Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Local(point,dimension,shift,0x2);// both with block stride loop iteration } - } else { + } else { // All permute extract done in comms phase prior to Stencil application // So tables are the same whether comm_dim or splice_dim sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); @@ -750,23 +784,23 @@ public: int ld = _grid->_ldimensions[dimension]; int gd = _grid->_gdimensions[dimension]; int ly = _grid->_simd_layout[dimension]; - + // Map to always positive shift modulo global full dimension. int shift = (shiftpm+fd)%fd; // the permute type int permute_dim =_grid->PermuteDim(dimension); - - for(int x=0;x_ostride[dimension]; - + int cb= (cbmask==0x2)? Odd : Even; - + int sshift = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); int sx = (x+sshift)%rd; - + int wraparound=0; if ( (shiftpm==-1) && (sx>x) ) { wraparound = 1; @@ -774,7 +808,7 @@ public: if ( (shiftpm== 1) && (sxNsimd(); - + int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int pd = _grid->_processors[dimension]; int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; - + assert(comm_dim==1); int shift = (shiftpm + fd) %fd; assert(shift>=0); assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; this->_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and // send to one or more remote nodes. - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); - - for(int x=0;xPermuteType(dimension); - + int sx = (x+sshift)%rd; - + int offnode = 0; if ( simd_layout > 1 ) { - + for(int i=0;i>(permute_type+1)); int ic= (i&inner_bit)? 
1:0; int my_coor = rd*ic + x; int nbr_coor = my_coor+sshift; int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - if ( nbr_proc ) { + + if ( nbr_proc ) { offnode =1; } } - - } else { + + } else { int comm_proc = ((x+sshift)/rd)%pd; offnode = (comm_proc!= 0); } - + int wraparound=0; if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -850,24 +884,24 @@ public: wraparound = 1; } if (!offnode) { - + int permute_slice=0; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + } else { int words = buffer_size; if (cbmask != 0x3) words=words>>1; - + // int rank = grid->_processor; // int recv_from_rank; // int xmit_to_rank; - + int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; - + ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - + } } } @@ -875,13 +909,13 @@ public: void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap) { int rd = _grid->_rdimensions[dimension]; - + if ( !_grid->CheckerBoarded(dimension) ) { - + int o = 0; // relative offset to base within plane - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*_grid->_ostride[dimension]; // offset in buffer - + // Simple block stride gather of SIMD objects for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ @@ -893,18 +927,18 @@ public: } o +=_grid->_slice_stride[dimension]; } - + } else { - - int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane - + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ - + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); - + if ( ocb&cbmask ) { int idx = point+(lo+o+b)*this->_npoints; _entries[idx]._offset =ro+o+b; @@ -912,24 +946,24 @@ public: _entries[idx]._permute=permute; _entries[idx]._around_the_world=wrap; } - + } o +=_grid->_slice_stride[dimension]; } - + } } // Routine builds up integer table for each site in _offsets, _is_local, _permute void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap) { int rd = _grid->_rdimensions[dimension]; - + if ( !_grid->CheckerBoarded(dimension) ) { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + // Simple block stride gather of SIMD objects for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ @@ -941,16 +975,16 @@ public: } o +=_grid->_slice_stride[dimension]; } - - } else { - - int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane int o = 0; // relative offset to base within plane int bo = 0; // offset in buffer - + for(int 
n=0;n<_grid->_slice_nblock[dimension];n++){ for(int b=0;b<_grid->_slice_block[dimension];b++){ - + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { int idx = point+(so+o+b)*this->_npoints; @@ -964,16 +998,16 @@ public: } } } - + template int Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; - + assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); - + int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int pd = _grid->_processors[dimension]; @@ -985,37 +1019,37 @@ public: assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; - + int cb= (cbmask==0x2)? Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + int shm_receive_only = 1; - for(int x=0;x>1; - + int bytes = words * compress.CommDatumSize(); - - int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { face_table.resize(face_idx+1); Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); } - + // int rank = _grid->_processor; int recv_from_rank; int xmit_to_rank; _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - + assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - + ///////////////////////////////////////////////////////// // try the direct copy if possible ///////////////////////////////////////////////////////// @@ -1028,13 +1062,13 @@ public: } send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); - if ( send_buf==NULL ) { + if ( send_buf==NULL ) { send_buf = this->u_send_buf_p; - } - + } + // Find out if we get the direct copy. void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p); - if (success==NULL) { + if (success==NULL) { // we found a packet that comes from MPI and contributes to this leg of stencil shm_receive_only = 0; } @@ -1043,9 +1077,9 @@ public: assert(send_buf!=NULL); Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); - + if ( compress.DecompressionStep() ) { - + if ( shm_receive_only ) { // Early decompress before MPI is finished is possible AddDecompress(&this->u_recv_buf_p[u_comm_offset], &recv_buf[u_comm_offset], @@ -1074,7 +1108,7 @@ public: } return shm_receive_only; } - + template int GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) { @@ -1102,7 +1136,7 @@ public: /////////////////////////////////////////////// int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // int words = sizeof(cobj)/sizeof(vector_type); - + assert(cbmask==0x3); // Fixme think there is a latent bug if not true // This assert will trap it if ever hit. Not hit normally so far int reduced_buffer_size = buffer_size; @@ -1118,22 +1152,22 @@ public: /////////////////////////////////////////// // Work out what to send where /////////////////////////////////////////// - + int cb = (cbmask==0x2)? 
Odd : Even; int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + // loop over outer coord planes orthog to dim int shm_receive_only = 1; - for(int x=0;x= rd ); if ( any_offnode ) { - - for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - + + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + // shm == receive pointer if offnode // shm == Translate[send pointer] if on node -- my view of his send pointer cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { + if (shm==NULL) { shm = rp; // we found a packet that comes from MPI and contributes to this shift. // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. @@ -1188,15 +1222,15 @@ public: AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); - - } else { - + + } else { + rpointers[i] = sp; - + } } - if ( shm_receive_only ) { + if ( shm_receive_only ) { AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); } else { AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); @@ -1231,9 +1265,9 @@ public: shm_bytes = 0.; calls = 0.; }; - + void Report(void) { -#define AVERAGE(A) +#define AVERAGE(A) #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; RealD NN = _grid->NodeCount(); @@ -1250,7 +1284,7 @@ public: } } if (threaded) commtime += t; - + _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { std::cout << GridLogMessage << " Stencil calls "< NAMESPACE_BEGIN(Grid); -//accelerator_inline void SIMTsynchronise(void) -accelerator_inline void synchronise(void) -{ -#ifdef __CUDA_ARCH__ -// __syncthreads(); - __syncwarp(); -#endif - return; -} -#ifndef __CUDA_ARCH__ +#ifndef GRID_SIMT ////////////////////////////////////////// // Trivial mapping of vectors on host ////////////////////////////////////////// -accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific - template accelerator_inline vobj coalescedRead(const vobj & __restrict__ vec,int lane=0) { @@ -66,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int template accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0) { - // vstream(vec, extracted); vec = extracted; } template accelerator_inline @@ -75,25 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ vstream(vec, extracted); } #else -accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific ////////////////////////////////////////// // Extract and insert slices on the GPU ////////////////////////////////////////// template accelerator_inline -typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd())) +typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd())) { return extractLane(lane,vec); } template accelerator_inline -typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd())) +typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd())) { int mask = vobj::Nsimd() >> (ptype + 1); int plane= doperm ? 
lane ^ mask : lane; return extractLane(plane,vec); } template accelerator_inline -void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd())) +void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd())) { insertLane(lane,vec,extracted); } diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index dbcbae8d..36becc49 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -59,6 +59,20 @@ class GridTensorBase {}; using DoublePrecision2= typename Traits::DoublePrecision2; \ static constexpr int TensorLevel = Traits::TensorLevel +/////////////////////////////////////////////////////////// +// Allows to turn scalar>>> back to double. +/////////////////////////////////////////////////////////// +template +accelerator_inline typename std::enable_if::value, T>::type +TensorRemove(T arg) { + return arg; +} +template +accelerator_inline auto TensorRemove(iScalar arg) + -> decltype(TensorRemove(arg._internal)) { + return TensorRemove(arg._internal); +} + template class iScalar { public: @@ -135,9 +149,10 @@ public: operator ComplexD() const { return (TensorRemove(_internal)); } + // instantiation of "Grid::iScalar::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, =0, =0U]" template = 0,IfNotSimd = 0> accelerator_inline operator RealD() const { - return TensorRemove(_internal); + return (RealD) TensorRemove(_internal); } template = 0, IfNotSimd = 0> accelerator_inline operator Integer() const { @@ -169,20 +184,6 @@ public: strong_inline scalar_type * end() { return begin() + Traits::count; } }; -/////////////////////////////////////////////////////////// -// Allows to turn scalar>>> back to double. 
-/////////////////////////////////////////////////////////// -template -accelerator_inline typename std::enable_if::value, T>::type -TensorRemove(T arg) { - return arg; -} -template -accelerator_inline auto TensorRemove(iScalar arg) - -> decltype(TensorRemove(arg._internal)) { - return TensorRemove(arg._internal); -} - template class iVector { public: diff --git a/Grid/tensors/Tensor_exp.h b/Grid/tensors/Tensor_exp.h index 11d37f9c..0a1d6389 100644 --- a/Grid/tensors/Tensor_exp.h +++ b/Grid/tensors/Tensor_exp.h @@ -55,7 +55,7 @@ template accelerator_inline iVector Exponentiate(c // Specialisation: Cayley-Hamilton exponential for SU(3) -#ifndef GRID_NVCC +#ifndef GRID_CUDA template::TensorLevel == 0>::type * =nullptr> accelerator_inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) { diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc new file mode 100644 index 00000000..2c4ad9df --- /dev/null +++ b/Grid/threads/Accelerator.cc @@ -0,0 +1,207 @@ +#include + +NAMESPACE_BEGIN(Grid); +uint32_t accelerator_threads=2; +uint32_t acceleratorThreads(void) {return accelerator_threads;}; +void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; + +#ifdef GRID_CUDA +cudaDeviceProp *gpu_props; +void acceleratorInit(void) +{ + int nDevices = 1; + cudaGetDeviceCount(&nDevices); + gpu_props = new cudaDeviceProp[nDevices]; + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + size_t totalDeviceMem=0; + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + cudaGetDeviceProperties(&gpu_props[i], i); + cudaDeviceProp prop; + prop = gpu_props[i]; + totalDeviceMem = prop.totalGlobalMem; + if ( world_rank == 0) { + printf("AcceleratorCudaInit: ========================\n"); + printf("AcceleratorCudaInit: Device Number : %d\n", i); + printf("AcceleratorCudaInit: ========================\n"); + printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name); + + GPU_PROP_FMT(totalGlobalMem,"%lld"); + GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + // GPU_PROP(unifiedAddressing); + // GPU_PROP(l2CacheSize); + // GPU_PROP(singleToDoublePrecisionPerfRatio); + } + } + MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours +#undef GPU_PROP_FMT +#undef GPU_PROP + +#ifdef GRID_IBM_SUMMIT + // IBM Jsrun makes cuda Device numbering screwy and not match rank + if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n"); +#else + if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n"); + cudaSetDevice(rank); +#endif + if ( world_rank == 0 ) printf("AcceleratorCudaInit: 
================================================\n"); +} +#endif + +#ifdef GRID_HIP +hipDeviceProp_t *gpu_props; +void acceleratorInit(void) +{ + int nDevices = 1; + hipGetDeviceCount(&nDevices); + gpu_props = new hipDeviceProp_t[nDevices]; + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + for (int i = 0; i < nDevices; i++) { + +#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); +#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); + + hipGetDeviceProperties(&gpu_props[i], i); + if ( world_rank == 0) { + hipDeviceProp_t prop; + prop = gpu_props[i]; + printf("AcceleratorHipInit: ========================\n"); + printf("AcceleratorHipInit: Device Number : %d\n", i); + printf("AcceleratorHipInit: ========================\n"); + printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); + + // GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); + // GPU_PROP(unifiedAddressing); + // GPU_PROP(l2CacheSize); + // GPU_PROP(singleToDoublePrecisionPerfRatio); + } + } +#undef GPU_PROP_FMT +#undef GPU_PROP +#ifdef GRID_IBM_SUMMIT + // IBM Jsrun makes cuda Device numbering screwy and not match rank + if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); +#else + if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n"); + hipSetDevice(rank); +#endif + if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); +} +#endif + + +#ifdef GRID_SYCL + +cl::sycl::queue *theGridAccelerator; + +void acceleratorInit(void) +{ + int nDevices = 1; + cl::sycl::gpu_selector selector; + cl::sycl::device selectedDevice { selector }; + theGridAccelerator = new sycl::queue (selectedDevice); + + char * localRankStr = NULL; + int rank = 0, world_rank=0; +#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" +#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" +#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" +#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) + { + rank = atoi(localRankStr); + } + if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + + auto devices = cl::sycl::device::get_devices(); + for(int d = 0;d().c_str()); + +#define GPU_PROP_FMT(prop,FMT) \ + printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); + +#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); + + GPU_PROP_STR(vendor); + GPU_PROP_STR(version); 
+ // GPU_PROP_STR(device_type); + /* + GPU_PROP(max_compute_units); + GPU_PROP(native_vector_width_char); + GPU_PROP(native_vector_width_short); + GPU_PROP(native_vector_width_int); + GPU_PROP(native_vector_width_long); + GPU_PROP(native_vector_width_float); + GPU_PROP(native_vector_width_double); + GPU_PROP(native_vector_width_half); + GPU_PROP(address_bits); + GPU_PROP(half_fp_config); + GPU_PROP(single_fp_config); + */ + // GPU_PROP(double_fp_config); + GPU_PROP(global_mem_size); + + } + if ( world_rank == 0 ) { + auto name = theGridAccelerator->get_device().get_info(); + printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); + printf("AcceleratorSyclInit: ================================================\n"); + } +} +#endif + +#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP)) +void acceleratorInit(void){} +#endif + +NAMESPACE_END(Grid); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h new file mode 100644 index 00000000..74a3ea22 --- /dev/null +++ b/Grid/threads/Accelerator.h @@ -0,0 +1,426 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/Accelerator.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#include + +#ifdef HAVE_MALLOC_MALLOC_H +#include +#endif +#ifdef HAVE_MALLOC_H +#include +#endif +#ifdef HAVE_MM_MALLOC_H +#include +#endif + +NAMESPACE_BEGIN(Grid); + +////////////////////////////////////////////////////////////////////////////////// +// Accelerator primitives; fall back to threading if not CUDA or SYCL +////////////////////////////////////////////////////////////////////////////////// +// +// Function attributes +// +// accelerator +// accelerator_inline +// +// Parallel looping +// +// accelerator_for +// accelerator_forNB +// uint32_t accelerator_barrier(); // device synchronise +// +// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd +// +// uint32_t acceleratorThreads(void); +// void acceleratorThreads(uint32_t); +// +// Warp control and info: +// +// acceleratorInit; +// void acceleratorSynchronise(void); // synch warp etc.. 
+// int acceleratorSIMTlane(int Nsimd); +// +// Memory management: +// +// void *acceleratorAllocShared(size_t bytes); +// void acceleratorFreeShared(void *ptr); +// +// void *acceleratorAllocDevice(size_t bytes); +// void acceleratorFreeDevice(void *ptr); +// +// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes); +// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes); +// +////////////////////////////////////////////////////////////////////////////////// + +uint32_t acceleratorThreads(void); +void acceleratorThreads(uint32_t); +void acceleratorInit(void); + +////////////////////////////////////////////// +// CUDA acceleration +////////////////////////////////////////////// + +#ifdef GRID_CUDA + +#ifdef __CUDA_ARCH__ +#define GRID_SIMT +#endif + +#define accelerator __host__ __device__ +#define accelerator_inline __host__ __device__ inline + +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return threadIdx.z; +#else + return 0; +#endif +} // CUDA specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ + __VA_ARGS__; \ + }; \ + int nt=acceleratorThreads(); \ + dim3 cu_threads(acceleratorThreads(),1,nsimd); \ + dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ + } + +template __global__ +void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) +{ + uint64_t x = threadIdx.x + blockDim.x*blockIdx.x; + uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; + uint64_t z = threadIdx.z; + if ( (x < num1) && (y +#include +NAMESPACE_BEGIN(Grid); + +extern cl::sycl::queue *theGridAccelerator; + +#ifdef __SYCL_DEVICE_ONLY__ +#define GRID_SIMT +#endif + +#define accelerator +#define accelerator_inline strong_inline + +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; +#else + return 0; +#endif +} // SYCL specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ + unsigned long nt=acceleratorThreads(); \ + unsigned long unum1 = num1; \ + unsigned long unum2 = num2; \ + cl::sycl::range<3> local {nt,1,nsimd}; \ + cl::sycl::range<3> global{unum1,unum2,nsimd}; \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) mutable { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { __VA_ARGS__ }; \ + }); \ + }); + +#define accelerator_barrier(dummy) theGridAccelerator->wait(); + +inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; +inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; +inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} + +#endif + +////////////////////////////////////////////// +// HIP acceleration +////////////////////////////////////////////// +#ifdef GRID_HIP +NAMESPACE_END(Grid); +#include +NAMESPACE_BEGIN(Grid); + +#ifdef __HIP_DEVICE_COMPILE__ +#define GRID_SIMT +#endif + +#define accelerator __host__ __device__ +#define accelerator_inline __host__ __device__ inline + +/*These routines define mapping from thread grid to loop & vector lane indexing */ +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return hipThreadIdx_z; +#else + return 0; +#endif +} // HIP specific + +#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane ) mutable { \ + { __VA_ARGS__;} \ + }; \ + int nt=acceleratorThreads(); \ + dim3 hip_threads(nt,1,nsimd); \ + dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \ + hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \ + 0,0, \ + num1,num2,nsimd,lambda); \ + } + +template __global__ +void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) +{ + uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x; + uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y; + uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z; + if ( (x < numx) && (y /* END LEGAL */ #pragma once -#ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) -#define MIN(x,y) ((x)>(y)?(y):(x)) -#endif - -#define strong_inline __attribute__((always_inline)) inline -#define UNROLL _Pragma("unroll") - -////////////////////////////////////////////////////////////////////////////////// -// New primitives; explicit host thread calls, and accelerator data parallel calls -////////////////////////////////////////////////////////////////////////////////// - -#ifdef _OPENMP -#define GRID_OMP -#include -#endif - -#ifdef GRID_OMP -#define DO_PRAGMA_(x) _Pragma (#x) -#define DO_PRAGMA(x) DO_PRAGMA_(x) -#define thread_num(a) omp_get_thread_num() -#define thread_max(a) omp_get_max_threads() -#else -#define DO_PRAGMA_(x) -#define DO_PRAGMA(x) -#define thread_num(a) (0) -#define thread_max(a) (1) -#endif - -#define thread_for( i, num, ... 
) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i __global__ -void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda) -{ - uint64_t isite = threadIdx.y; - uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x; - if ( (osite >>(nsimd,num,lambda); \ - } - -// Copy the for_each_n style ; Non-blocking variant (default -#define accelerator_for( iterator, num, nsimd, ... ) \ - accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \ - accelerator_barrier(dummy); - -#else - -#define accelerator -#define accelerator_inline strong_inline -#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); -#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); -#define accelerator_barrier(dummy) - -#endif +#include +#include diff --git a/Grid/threads/ThreadReduction.h b/Grid/threads/ThreadReduction.h new file mode 100644 index 00000000..f0d24d50 --- /dev/null +++ b/Grid/threads/ThreadReduction.h @@ -0,0 +1,127 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/ThreadReduction.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +// Introduce a class to gain deterministic bit reproducible reduction. +// make static; perhaps just a namespace is required. 
+NAMESPACE_BEGIN(Grid); + +class GridThread { +public: + static int _threads; + static int _hyperthreads; + static int _cores; + + static void SetCores(int cr) { +#ifdef GRID_OMP + _cores = cr; +#else + _cores = 1; +#endif + } + static void SetThreads(int thr) { +#ifdef GRID_OMP + _threads = MIN(thr,omp_get_max_threads()) ; + omp_set_num_threads(_threads); +#else + _threads = 1; +#endif + }; + static void SetMaxThreads(void) { +#ifdef GRID_OMP + _threads = omp_get_max_threads(); + omp_set_num_threads(_threads); +#else + _threads = 1; +#endif + }; + static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; }; + static int GetCores(void) { return _cores; }; + static int GetThreads(void) { return _threads; }; + static int SumArraySize(void) {return _threads;}; + + static void GetWork(int nwork, int me, int & mywork, int & myoff){ + GetWork(nwork,me,mywork,myoff,_threads); + } + static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){ + int basework = nwork/units; + int backfill = units-(nwork%units); + if ( me >= units ) { + mywork = myoff = 0; + } else { + mywork = (nwork+me)/units; + myoff = basework * me; + if ( me > backfill ) + myoff+= (me-backfill); + } + return; + }; + + static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){ + me = ThreadBarrier(); + GetWork(nwork,me,mywork,myoff); + }; + + static int ThreadBarrier(void) { +#ifdef GRID_OMP +#pragma omp barrier + return omp_get_thread_num(); +#else + return 0; +#endif + }; + + template static void ThreadSum( std::vector &sum_array,obj &val,int me){ + sum_array[me] = val; + val=Zero(); + ThreadBarrier(); + for(int i=0;i<_threads;i++) val+= sum_array[i]; + ThreadBarrier(); + } + + static void bcopy(const void *src, void *dst, size_t len) { +#ifdef GRID_OMP +#pragma omp parallel + { + const char *c_src =(char *) src; + char *c_dest=(char *) dst; + int me,mywork,myoff; + GridThread::GetWorkBarrier(len,me, mywork,myoff); + bcopy(&c_src[myoff],&c_dest[myoff],mywork); + } +#else + bcopy(src,dst,len); +#endif + } + + +}; + +NAMESPACE_END(Grid); + diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 29cae060..a9fa13ea 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -28,101 +28,47 @@ Author: paboyle /* END LEGAL */ #pragma once +#ifndef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) +#define MIN(x,y) ((x)>(y)?(y):(x)) +#endif -// Introduce a class to gain deterministic bit reproducible reduction. -// make static; perhaps just a namespace is required. 
-NAMESPACE_BEGIN(Grid); +#define strong_inline __attribute__((always_inline)) inline +#define UNROLL _Pragma("unroll") -class GridThread { -public: - static int _threads; - static int _hyperthreads; - static int _cores; +////////////////////////////////////////////////////////////////////////////////// +// New primitives; explicit host thread calls, and accelerator data parallel calls +////////////////////////////////////////////////////////////////////////////////// + +#ifdef _OPENMP +#define GRID_OMP +#include +#endif - static void SetCores(int cr) { #ifdef GRID_OMP - _cores = cr; +#define DO_PRAGMA_(x) _Pragma (#x) +#define DO_PRAGMA(x) DO_PRAGMA_(x) +#define thread_num(a) omp_get_thread_num() +#define thread_max(a) omp_get_max_threads() #else - _cores = 1; +#define DO_PRAGMA_(x) +#define DO_PRAGMA(x) +#define thread_num(a) (0) +#define thread_max(a) (1) #endif - } - static void SetThreads(int thr) { -#ifdef GRID_OMP - _threads = MIN(thr,omp_get_max_threads()) ; - omp_set_num_threads(_threads); -#else - _threads = 1; -#endif - }; - static void SetMaxThreads(void) { -#ifdef GRID_OMP - _threads = omp_get_max_threads(); - omp_set_num_threads(_threads); -#else - _threads = 1; -#endif - }; - static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; }; - static int GetCores(void) { return _cores; }; - static int GetThreads(void) { return _threads; }; - static int SumArraySize(void) {return _threads;}; - static void GetWork(int nwork, int me, int & mywork, int & myoff){ - GetWork(nwork,me,mywork,myoff,_threads); - } - static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){ - int basework = nwork/units; - int backfill = units-(nwork%units); - if ( me >= units ) { - mywork = myoff = 0; - } else { - mywork = (nwork+me)/units; - myoff = basework * me; - if ( me > backfill ) - myoff+= (me-backfill); - } - return; - }; - - static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){ - me = ThreadBarrier(); - GetWork(nwork,me,mywork,myoff); - }; - - static int ThreadBarrier(void) { -#ifdef GRID_OMP -#pragma omp barrier - return omp_get_thread_num(); -#else - return 0; -#endif - }; - - template static void ThreadSum( std::vector &sum_array,obj &val,int me){ - sum_array[me] = val; - val=Zero(); - ThreadBarrier(); - for(int i=0;i<_threads;i++) val+= sum_array[i]; - ThreadBarrier(); - } - - static void bcopy(const void *src, void *dst, size_t len) { -#ifdef GRID_OMP -#pragma omp parallel - { - const char *c_src =(char *) src; - char *c_dest=(char *) dst; - int me,mywork,myoff; - GridThread::GetWorkBarrier(len,me, mywork,myoff); - bcopy(&c_src[myoff],&c_dest[myoff],mywork); - } -#else - bcopy(src,dst,len); -#endif - } - - -}; - -NAMESPACE_END(Grid); +#define thread_for( i, num, ... 
) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i=0); assert(sz<=MaxEntries); +#endif _size = sz; } accelerator_inline void resize(size_type sz,const value &val) { - assert(sz>=0); - assert(sz<=MaxEntries); - _size = sz; + resize(sz); for(int s=0;s ©me) { diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 1b672141..6a84a2f2 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -73,8 +73,6 @@ feenableexcept (unsigned int excepts) } #endif -uint32_t gpu_threads=8; - NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////// @@ -192,16 +190,12 @@ void GridParseLayout(char **argv,int argc, assert(ompthreads.size()==1); GridThread::SetThreads(ompthreads[0]); } - if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){ + if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){ std::vector gputhreads(0); -#ifndef GRID_NVCC - std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was" - << " not compiled with GPU support" << std::endl; -#endif - arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads"); + arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads"); GridCmdOptionIntVector(arg,gputhreads); assert(gputhreads.size()==1); - gpu_threads=gputhreads[0]; + acceleratorThreads(gputhreads[0]); } if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ @@ -241,8 +235,6 @@ static int Grid_is_initialised; ///////////////////////////////////////////////////////// void GridBanner(void) { - static int printed =0; - if( !printed ) { std::cout < still correct ? + +-------------------------------------------------------- + +* Fujitsu fcc + +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" + + +* Fujitsu fcc w/ MPI + +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 1bb77aff..dc09549c 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -237,9 +237,9 @@ public: Vec rn ; random(sRNG,rn); - LatticeVec z(&Grid); z=rn; - LatticeVec x(&Grid); x=rn; - LatticeVec y(&Grid); y=rn; + LatticeVec z(&Grid); z=Zero(); + LatticeVec x(&Grid); x=Zero(); + LatticeVec y(&Grid); y=Zero(); double a=2.0; uint64_t Nloop=NLOOP; @@ -247,9 +247,9 @@ public: double start=usecond(); for(int i=0;i -#ifdef GRID_NVCC +#ifdef GRID_CUDA #define CUDA_PROFILE #endif @@ -77,7 +77,7 @@ int main (int argc, char ** argv) std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); - + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; @@ -107,8 +107,8 @@ int main (int argc, char ** argv) LatticeFermion err(FGrid); std::cout << GridLogMessage << "Drawing gauge field" << std::endl; - LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); + SU3::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << 
std::endl; #if 0 Umu=1.0; @@ -126,11 +126,11 @@ int main (int argc, char ** argv) // Naive wilson implementation //////////////////////////////////// // replicate across fifth dimension - LatticeGaugeField Umu5d(FGrid); + LatticeGaugeField Umu5d(FGrid); std::vector U(4,FGrid); { - auto Umu5d_v = Umu5d.View(); - auto Umu_v = Umu.View(); + autoView( Umu5d_v, Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;sBarrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + if(( norm2(err)>1.0e-4) ) { + /* std::cout << "RESULT\n " << result<Barrier(); exit(-1); } @@ -235,7 +249,7 @@ int main (int argc, char ** argv) } double t1=usecond(); FGrid->Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4)){ +/* std::cout<< "DAG RESULT\n " <Barrier(); - + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* std::cout<< "Deo RESULT\n " < & latt4, int Ls, int threads,int report ) LatticeGaugeField Umu5d(FGrid); // replicate across fifth dimension - auto Umu5d_v = Umu5d.View(); - auto Umu_v = Umu.View(); - for(int ss=0;ssoSites();ss++){ - for(int s=0;soSites();ss++){ + for(int s=0;s > &mat, for(int b=0;b > &mat, for(int b=0;b > &mat int ss= so+n*stride+b; for(int i=0;i > &m for(int i=0;i > &m // Trigger unroll for ( int m=0;m +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + + typedef typename ImprovedStaggeredFermionF::FermionField FermionField; + typename ImprovedStaggeredFermionF::ImplParams params; + + FermionField src (&Grid); random(pRNG,src); + FermionField result(&Grid); result=Zero(); + FermionField ref(&Grid); ref=Zero(); + FermionField tmp(&Grid); tmp=Zero(); + FermionField err(&Grid); tmp=Zero(); + LatticeGaugeFieldF Umu(&Grid); random(pRNG,Umu); + std::vector U(4,&Grid); + + double volume=1; + for(int mu=0;mu(Umu,U[nn],nn); + } +#endif + + for(int mu=0;mu(Umu,mu); + } + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params); + + std::cout< using namespace std; using namespace Grid; - ; template struct scal { @@ -51,6 +50,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; + { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -100,6 +100,8 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o,result_o_2); + MemoryManager::Print(); + LatticeFermionD diff_o(FrbGrid); RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); @@ -130,7 +132,9 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " CG checksums "<oSites();i++){ auto SE = gStencil.GetEntry(0,i); - auto check = Check.View(); - auto foo = Foo.View(); - + autoView(check, Check, CpuWrite); + autoView( foo, Foo, CpuRead); // Encapsulate in a general wrapper check[i] = foo[SE->_offset]; auto tmp=check[i]; if (SE->_permute & 0x1 ) { permute(check[i],tmp,0); tmp=check[i];} @@ -147,8 +146,8 @@ int main(int argc, char ** argv) }}}} if (nrm > 1.0e-4) { - auto check = Check.View(); - auto bar = Bar.View(); + autoView( check , Check, CpuRead); + autoView( bar , Bar, CpuRead); for(int i=0;i void sfunc(reduce &rr,scal &i1,scal &i2) con std::string name(void) const { return std::string("Reduce"); } }; -template +template void Tester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -172,6 +172,8 @@ void Tester(const functor &func) } if ( ok==0 ) { std::cout< +template void ReductionTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -278,12 +282,14 @@ void ReductionTester(const functor &func) } if ( ok==0 ) { std::cout< +template void 
IntReductionTester(const functor &func) { int Nsimd = vec::Nsimd(); @@ -323,6 +329,8 @@ void IntReductionTester(const functor &func) } if ( ok==0 ) { std::cout< void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);} - template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { + template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { int sz=in.size(); int msk = sz>>(n+1); for(int i=0;i void apply(ExtractBuffer &r1, ExtractBuffer &r2, ExtractBuffer &in1, - ExtractBuffer &in2) const - { + ExtractBuffer &in2) const + { int sz=in1.size(); int msk = sz>>(n+1); @@ -364,7 +372,7 @@ public: if ( (i&msk) == 0 ) { r2[i]=in1[j2];} else { r2[i]=in2[j2];} - } + } } std::string name(void) const { return std::string("Exchange"); } }; @@ -374,7 +382,7 @@ public: int n; funcRotate(int _n) { n=_n;}; template void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);} - template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { + template void apply(ExtractBuffer &rr,ExtractBuffer &in) const { int sz = in.size(); for(int i=0;i +template void PermTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -425,37 +433,39 @@ void PermTester(const functor &func) for(int i=0;i1.0e-7){ - std::cout< +template void ExchangeTester(const functor &func) { GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - + int Nsimd = vec::Nsimd(); ExtractBuffer input1(Nsimd); @@ -566,7 +576,7 @@ int main (int argc, char ** argv) std::cout << " Test {1,2,3,4} " << Test < seeds({1,2,3,4}); @@ -742,7 +752,7 @@ int main (int argc, char ** argv) for(int r=0;r(funcRotate(r)); } - + std::cout<_is_local && SE->_permute ) permute(check[i],foo[SE->_offset],permute_type); else if (SE->_is_local) @@ -151,8 +151,8 @@ int main(int argc, char ** argv) { }}}} if (nrm > 1.0e-4) { - auto check = Check.View(); - auto bar = Bar.View(); + autoView( check , Check, CpuRead); + autoView( bar , Bar, CpuRead); for(int i=0;i " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(ocheck[i],efoo[SE->_offset],permute_type); else if (SE->_is_local) @@ -226,8 +226,8 @@ int main(int argc, char ** argv) { SE = OStencil.GetEntry(permute_type,0,i); // std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(echeck[i],ofoo[SE->_offset],permute_type); else if (SE->_is_local) diff --git a/tests/core/Test_cshift_red_black.cc b/tests/core/Test_cshift_red_black.cc index 34325072..4fdd5fc0 100644 --- a/tests/core/Test_cshift_red_black.cc +++ b/tests/core/Test_cshift_red_black.cc @@ -82,7 +82,7 @@ int main (int argc, char ** argv) pickCheckerboard(Odd,Uo,U); // std::cout<oSites();ss++){ - for(int s=0;soSites();ss++){ + for(int s=0;s U(4,FGrid); for(int mu=0;mu +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + // pRNG.SeedFixedIntegers(std::vector({45,12,81,9}); + + typedef typename NaiveStaggeredFermionR::FermionField FermionField; + typedef typename NaiveStaggeredFermionR::ComplexField ComplexField; + typename NaiveStaggeredFermionR::ImplParams params; + + FermionField src (&Grid); random(pRNG,src); + FermionField result(&Grid); result=Zero(); + FermionField ref(&Grid); ref=Zero(); + FermionField tmp(&Grid); tmp=Zero(); + FermionField err(&Grid); tmp=Zero(); + FermionField phi (&Grid); random(pRNG,phi); + FermionField chi (&Grid); random(pRNG,chi); + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + std::vector U(4,&Grid); + + + double volume=1; + for(int mu=0;mu(Umu,mu); + /* Debug force unit + U[mu] = 1.0; + PokeIndex(Umu,U[mu],mu); + */ + } + + ref = Zero(); + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD u0=1.0; + + { // Simple improved staggered implementation + ref = Zero(); + RealD c1tad = 0.5*c1/u0; + + Lattice > coor(&Grid); + + Lattice > x(&Grid); LatticeCoordinate(x,0); + Lattice > y(&Grid); LatticeCoordinate(y,1); + Lattice > z(&Grid); LatticeCoordinate(z,2); + Lattice > t(&Grid); LatticeCoordinate(t,3); + + Lattice > lin_z(&Grid); lin_z=x+y; + Lattice > lin_t(&Grid); lin_t=x+y+z; + + for(int mu=0;mu * = < chi | Deo^dag| phi> "< HermOpEO(Ds); + HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); + HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + + HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); + HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + + pDce = innerProduct(phi_e,dchi_e); + pDco = innerProduct(phi_o,dchi_o); + cDpe = innerProduct(chi_e,dphi_e); + cDpo = innerProduct(chi_o,dphi_o); + + std::cout< U(4,FGrid); { - auto Umu5d_v = Umu5d.View(); - auto Umu_v = Umu.View(); + autoView( Umu5d_v , Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;soSites(),{ uint64_t ss= sss*Ls; typedef vSpinColourVector spinor; diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index 4eeb8c27..cb30faad 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -98,9 +98,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc index 009f50b3..81a1b8c4 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -100,9 
+100,9 @@ int main (int argc, char ** argv) // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 670e7589..0b0ba346 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -110,9 +110,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index d762e22a..b39fdd14 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -119,9 +119,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc index 66ae9dcf..58258a5e 100644 --- a/tests/forces/Test_dwf_gpforce_eofa.cc +++ b/tests/forces/Test_dwf_gpforce_eofa.cc @@ -114,9 +114,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc index c4e214bb..21f0b9d0 100644 --- a/tests/forces/Test_gp_plaq_force.cc +++ b/tests/forces/Test_gp_plaq_force.cc @@ -85,9 +85,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); - auto mom_v = mom.View(); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index 2573af6a..bb4ea6de 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -87,9 +87,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto Uprime_v= Uprime.View(); - auto U_v = U.View(); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index 09a1dc4b..bdc332d9 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -105,9 +105,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v 
= mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index cd30d898..1c85a5d9 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -99,9 +99,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index a1c4e930..11e69652 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -101,9 +101,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index f71e2d41..f85501fa 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -112,9 +112,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 4975f36d..68163e63 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -115,9 +115,9 @@ int main (int argc, char** argv) SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); // fourth order exponential approx thread_foreach( i, mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt + mom_v[i](mu) *mom_v[i](mu) *U_v[i](mu)*(dt*dt/2.0) diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc index 3ea2c6aa..17dce530 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -101,9 +101,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc index 9a78de24..ed72f2c0 100644 --- a/tests/forces/Test_rect_force.cc +++ b/tests/forces/Test_rect_force.cc @@ -87,9 +87,9 @@ int main (int argc, char ** argv) 
PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); - auto mom_v = mom.View(); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index 47f1516a..c8b3a7f4 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -105,9 +105,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(); - auto mom_v = mom.View(); - auto Uprime_v = Uprime.View(); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu); Uprime_v[i](mu) += mom_v[i](mu)*U_v[i](mu)*dt ; diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index d9ace23c..f26f0ac9 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -105,9 +105,9 @@ int main(int argc, char **argv) Hmom -= real(sum(trace(mommu * mommu))); PokeIndex(mom, mommu, mu); - auto Uprime_v = Uprime.View(); - auto U_v = U.View(); - auto mom_v = mom.View(); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(ss,mom_v, { Uprime_v[ss]._internal[mu] = ProjectOnGroup(Exponentiate(mom_v[ss]._internal[mu], dt, 12) * U_v[ss]._internal[mu]); diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index 2730885f..e24ae601 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -114,9 +114,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(); - auto U_v = U.View(); - auto Uprime_v = Uprime.View(); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc index 6fa90f32..3b8cdda6 100644 --- a/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonMixedRepresentationsFermionGauge.cc @@ -35,7 +35,7 @@ directory int main(int argc, char **argv) { -#ifndef GRID_NVCC +#ifndef GRID_CUDA using namespace Grid; diff --git a/tests/hmc/Test_multishift_sqrt.cc b/tests/hmc/Test_multishift_sqrt.cc index f8477220..31697c12 100644 --- a/tests/hmc/Test_multishift_sqrt.cc +++ b/tests/hmc/Test_multishift_sqrt.cc @@ -31,7 +31,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; template class DumbOperator : public LinearOperatorBase { public: @@ -57,7 +56,7 @@ public: // Support for coarsening to a multigrid void OpDiag (const Field &in, Field &out) {}; void OpDir (const Field &in, Field &out,int dir,int disp){}; - void OpDirAll (const Field &in, std::vector &out) {}; // Abstract base + void OpDirAll (const Field &in, std::vector &out) {}; void Op (const Field &in, Field &out){ out = scale * in; @@ -105,7 +104,7 @@ int main (int argc, char ** argv) GridDefaultMpi()); double lo=0.001; - double hi=1.0; + double hi=20.0; int precision=64; int degree=10; AlgRemez remez(lo,hi,precision); diff --git a/tests/solver/Test_dwf_hdcr.cc 
b/tests/solver/Test_dwf_hdcr.cc index 873530ff..8e083231 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -1,5 +1,3 @@ - - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -302,8 +300,8 @@ int main (int argc, char ** argv) int nb=nbasisc/2; CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0); for(int n=0;noSites();site++){ subspace_g5[site](nn) = subspace[site](nn); diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc new file mode 100644 index 00000000..9e11c160 --- /dev/null +++ b/tests/solver/Test_dwf_multigrid.cc @@ -0,0 +1,594 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + 
_PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; 
void Level(int lv) {level = lv; }; + + + MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, + FineOperator &Fine, + FineSmoother &Smoother, + Guesser &Guess_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _CoarseOperator(Coarse), + _FineOperator(Fine), + _Smoother(Smoother), + _Guess(Guess_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + CoarseVector Csrc(_CoarseOperator.Grid()); + CoarseVector Csol(_CoarseOperator.Grid()); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,500,100,100,0.0); + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOpPV(Dpv); + + std::cout< CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + // GeneralisedMinimalResidual CoarseGMRES(tol,MaxIt,20); + + BiCGSTAB FineBiCGSTAB(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + // GeneralisedMinimalResidual FineGMRES(tol,MaxIt,20); + + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + PVdagMLinearOperator FinePVdagM(Ddwf,Dpv);// M_{pv}^\dag M + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + SchurDiagOneOperator FineDiagOne(Ddwf); // 1 - M_ee^{-1} Meo Moo^{-1} Moe e + + MdagMLinearOperator CoarseMdagM(LDOp); + PVdagMLinearOperator CoarsePVdagM(LDOp,LDOpPV); + + std::cout< IRLCheby(0.03,12.0,71); // 1 iter + FunctionHermOp IRLOpCheby(IRLCheby,CoarseMdagM); + PlainHermOp IRLOp (CoarseMdagM); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + + std::cout< DeflCoarseGuesser(evec,eval); + NormalEquations DeflCoarseCGNE (LDOp,CoarseCG,DeflCoarseGuesser); + c_res=Zero(); + DeflCoarseCGNE(c_src,c_res); + + + std::cout< CoarseMgridCG(0.001,1000); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + + typedef HDCRPreconditioner > TwoLevelHDCR; + TwoLevelHDCR TwoLevelPrecon(Aggregates, + HermIndefOp, + FineSmoother, + DeflCoarseCGNE); + TwoLevelPrecon.Level(1); + // PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + PrecGeneralisedConjugateResidualNonHermitian 
l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + + f_res=Zero(); + + CoarseCG.Tolerance=0.02; + l1PGCR(f_src,f_res); + + std::cout< CoarseMgridBiCGSTAB(0.01,1000); + BiCGSTAB FineMgridBiCGSTAB(0.0,24); + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser FineZeroGuesser; + + SolverWrapper FineBiCGSmoother( FinePVdagM, FineMgridBiCGSTAB, FineZeroGuesser); + SolverWrapper CoarsePVdagMSolver(CoarsePVdagM,CoarseMgridBiCGSTAB,CoarseZeroGuesser); + typedef HDCRPreconditioner > TwoLevelMG; + + TwoLevelMG _TwoLevelMG(Aggregates, + FinePVdagM, + FineBiCGSmoother, + CoarsePVdagMSolver); + _TwoLevelMG.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian pvPGCR(1.0e-8,100,FinePVdagM,_TwoLevelMG,16,16); + pvPGCR.Level(1); + + f_res=Zero(); + pvPGCR(f_src,f_res); + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +//#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + 
assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + const int nbasis= 8; + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< SubspaceOp(Dw); + + Subspace Aggregates4D(Coarse4d,UGrid,0); + Subspace Aggregates5D(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + std::cout< Level1Op; + + 
NonHermitianLinearOperator LinOpDwf(Ddwf); + + Level1Op LDOp (*Coarse5d,0); + + std::cout< CoarseMdagM(LDOp); + BiCGSTAB CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + + c_res=Zero(); + CoarseCG(CoarseMdagM,c_src,c_res); + + std::cout<
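
Note on the accessor change that recurs throughout the test updates above: the old pattern auto v = field.View() is replaced by autoView(v, field, Mode), where the mode (CpuRead, CpuWrite, AcceleratorWrite, ...) tells the memory manager which copy of the lattice data has to be made current and whether it will be dirtied, and the view is closed again when it leaves scope. The snippet below is a minimal standalone sketch of that idea built around a made-up MockField / MockView pair; it is not Grid's implementation, only an illustration of why a scoped, mode-tagged view is preferable to a bare accessor on an object whose data can live on both host and device.

#include <cstddef>
#include <cstdio>
#include <vector>

enum ViewMode { CpuRead, CpuWrite };   // mock of the access-mode tag

// Stand-in for a lattice object whose data may live on host and device.
struct MockField {
  std::vector<double> host_data;
  bool host_stale   = false;           // device copy is newer than the host copy
  bool device_stale = false;           // host copy is newer than the device copy
  explicit MockField(std::size_t n) : host_data(n, 0.0) {}
};

// Scoped, mode-tagged view: opening it makes the host copy current,
// closing a write view records that the device copy is now out of date.
class MockView {
  MockField &f_;
  ViewMode   mode_;
public:
  MockView(MockField &f, ViewMode m) : f_(f), mode_(m) {
    if (f_.host_stale) { std::printf("copy device -> host\n"); f_.host_stale = false; }
  }
  ~MockView() { if (mode_ == CpuWrite) f_.device_stale = true; }
  double &operator[](std::size_t i)       { return f_.host_data[i]; }
  double  operator[](std::size_t i) const { return f_.host_data[i]; }
};

int main() {
  const std::size_t N = 8;
  const double dt = 0.01;
  MockField mom(N), U(N), Uprime(N);
  {
    // Analogue of: autoView(mom_v, mom, CpuRead); autoView(Uprime_v, Uprime, CpuWrite);
    MockView mom_v(mom, CpuRead), U_v(U, CpuRead), Uprime_v(Uprime, CpuWrite);
    for (std::size_t i = 0; i < N; i++)
      Uprime_v[i] = U_v[i] + mom_v[i] * U_v[i] * dt;   // first-order piece of the exponential update
  } // views close here, before any later (device-side) use of Uprime
  std::printf("Uprime device copy stale: %s\n", Uprime.device_stale ? "yes" : "no");
  return 0;
}

Opening the views in a tight scope, as the updated force tests do, means the coherence state of each field is settled before any subsequent kernel touches the same data, which a bare View() accessor with no declared mode cannot guarantee.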
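
The ChebyshevSmoother introduced in the new solver test builds Chebyshev(lo, hi, order, InverseApproximation) and applies that polynomial of MdagM as a cheap approximate inverse on the window [lo, hi]; the fine smoother in the test uses 0.5, 60.0 and order 10. The self-contained sketch below shows the same construction for a scalar argument: fit Chebyshev coefficients to f(x) = 1/x on [lo, hi] at Chebyshev nodes, then evaluate the expansion with the three-term recurrence. The function names chebFit and chebEval are invented for this sketch and it is not Grid code; note also that a 10th-order fit of 1/x over such a wide window is deliberately crude, which is exactly what makes it a smoother rather than a solver.

#include <cmath>
#include <cstdio>
#include <vector>

// Chebyshev fit of f on [lo,hi] at N Chebyshev nodes:
// c[k] = (2/N) * sum_j f(x_j) cos(pi k (j+1/2)/N),  x_j = cos(pi (j+1/2)/N) mapped to [lo,hi].
std::vector<double> chebFit(double lo, double hi, int N, double (*f)(double)) {
  const double PI = std::acos(-1.0);
  std::vector<double> c(N, 0.0);
  for (int k = 0; k < N; k++)
    for (int j = 0; j < N; j++) {
      double t = std::cos(PI * (j + 0.5) / N);
      double x = 0.5 * (hi - lo) * t + 0.5 * (hi + lo);
      c[k] += (2.0 / N) * f(x) * std::cos(PI * k * (j + 0.5) / N);
    }
  return c;
}

// Evaluate c[0]/2 + sum_{k>=1} c[k] T_k(t(x)) via the recurrence T_{n+1} = 2 t T_n - T_{n-1}.
double chebEval(double lo, double hi, const std::vector<double> &c, double x) {
  double t = (2.0 * x - (hi + lo)) / (hi - lo);
  double Tnm = 1.0, Tn = t, sum = 0.5 * c[0] + c[1] * t;
  for (int k = 2; k < (int)c.size(); k++) {
    double Tnp = 2.0 * t * Tn - Tnm;
    sum += c[k] * Tnp;
    Tnm = Tn; Tn = Tnp;
  }
  return sum;
}

double inverseApproximation(double x) { return 1.0 / x; }   // the target function

int main() {
  const double lo = 0.5, hi = 60.0;   // the fine-smoother window used in the test
  const int order = 10;
  std::vector<double> c = chebFit(lo, hi, order, inverseApproximation);
  for (double x : {0.5, 1.0, 5.0, 20.0, 60.0})
    std::printf("x = %6.2f   1/x = %8.5f   cheb(x) = %8.5f\n", x, 1.0 / x, chebEval(lo, hi, c, x));
  return 0;
}

Running it typically shows good agreement in the bulk of the window and a sizeable error near the low edge; in the two-level context that low-end error is the part of the residual the coarse-grid correction is there to remove.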
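
The HDCRPreconditioner wired into the new solver tests applies the classic two-level cycle: smooth on the fine grid, form the residual, project it into the coarse subspace, solve approximately on the coarse grid, promote the correction back, add it, and optionally smooth again. The sketch below reproduces that control flow for a 1D Laplacian with weighted Jacobi smoothing, full-weighting restriction, linear prolongation and an exact tridiagonal coarse solve; all names (applyA, restrictFW, twoLevelCycle, ...) and the model problem are placeholders, not the Aggregation / CoarsenedMatrix machinery of the test, but the cycle structure is the same.

#include <cmath>
#include <cstdio>
#include <vector>

using Vec = std::vector<double>;

// Fine operator: 1D Laplacian with Dirichlet boundaries, A = tridiag(-1, 2, -1).
Vec applyA(const Vec &x) {
  std::size_t n = x.size();
  Vec y(n);
  for (std::size_t i = 0; i < n; i++) {
    double l = (i > 0)     ? x[i - 1] : 0.0;
    double r = (i + 1 < n) ? x[i + 1] : 0.0;
    y[i] = 2.0 * x[i] - l - r;
  }
  return y;
}

// Weighted Jacobi smoother: x <- x + w D^{-1} (b - A x), with D = diag(A) = 2 I.
void smooth(Vec &x, const Vec &b, int sweeps, double w = 2.0 / 3.0) {
  for (int s = 0; s < sweeps; s++) {
    Vec Ax = applyA(x);
    for (std::size_t i = 0; i < x.size(); i++) x[i] += w * (b[i] - Ax[i]) / 2.0;
  }
}

// Full-weighting restriction onto every second point, and linear prolongation.
Vec restrictFW(const Vec &f) {
  Vec c(f.size() / 2);
  for (std::size_t i = 0; i < c.size(); i++) {
    double right = (2 * i + 2 < f.size()) ? f[2 * i + 2] : 0.0;
    c[i] = 0.25 * f[2 * i] + 0.5 * f[2 * i + 1] + 0.25 * right;
  }
  return c;
}
Vec prolong(const Vec &c, std::size_t nf) {
  Vec f(nf, 0.0);
  for (std::size_t i = 0; i < c.size(); i++) {
    f[2 * i]     += 0.5 * c[i];
    f[2 * i + 1] += c[i];
    if (2 * i + 2 < nf) f[2 * i + 2] += 0.5 * c[i];
  }
  return f;
}

// Exact coarse solve: Thomas algorithm for tridiag(-1, 2, -1) x = d.
Vec thomasSolve(Vec d) {
  std::size_t n = d.size();
  Vec c(n, 0.0), x(n, 0.0);
  c[0] = -0.5; d[0] *= 0.5;
  for (std::size_t i = 1; i < n; i++) {
    double m = 2.0 + c[i - 1];
    c[i] = -1.0 / m;
    d[i] = (d[i] + d[i - 1]) / m;
  }
  x[n - 1] = d[n - 1];
  for (std::size_t i = n - 1; i-- > 0;) x[i] = d[i] - c[i] * x[i + 1];
  return x;
}

// One two-level cycle: smooth / restrict residual / coarse solve / prolong / correct / smooth.
void twoLevelCycle(Vec &x, const Vec &b) {
  smooth(x, b, 3);                                   // fine smoother
  Vec Ax = applyA(x), r(x.size());
  for (std::size_t i = 0; i < x.size(); i++) r[i] = b[i] - Ax[i];
  Vec rc = restrictFW(r);                            // project residual to the coarse grid
  Vec ec = thomasSolve(rc);                          // coarse solve
  for (double &v : ec) v *= 4.0;                     // Galerkin coarse operator here is (1/4) tridiag(-1,2,-1)
  Vec e = prolong(ec, x.size());                     // promote the correction
  for (std::size_t i = 0; i < x.size(); i++) x[i] += e[i];
  smooth(x, b, 3);                                   // post-smooth (a common, optional choice)
}

int main() {
  const std::size_t n = 127;                         // 127 fine points coarsen to 63
  Vec b(n, 1.0), x(n, 0.0);
  for (int it = 0; it < 10; it++) {
    twoLevelCycle(x, b);
    Vec Ax = applyA(x);
    double r2 = 0.0;
    for (std::size_t i = 0; i < n; i++) r2 += (b[i] - Ax[i]) * (b[i] - Ax[i]);
    std::printf("cycle %2d   |r| = %.3e\n", it + 1, std::sqrt(r2));
  }
  return 0;
}

In the test the smoother is the Chebyshev polynomial sketched above and the coarse solve is a deflated CG or a BiCGSTAB on the coarsened operator; here both are stand-ins, but the steady per-cycle drop of |r| printed by main is the behaviour the outer preconditioned GCR relies on.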