diff --git a/Grid/qcd/action/ActionParams.h b/Grid/qcd/action/ActionParams.h index 274ff318..b2a06280 100644 --- a/Grid/qcd/action/ActionParams.h +++ b/Grid/qcd/action/ActionParams.h @@ -39,29 +39,38 @@ struct GparityWilsonImplParams { Coordinate twists; //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs Coordinate dirichlet; // Blocksize of dirichlet BCs - GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); }; + int partialDirichlet; + GparityWilsonImplParams() : twists(Nd, 0) { + dirichlet.resize(0); + partialDirichlet=0; + }; }; struct WilsonImplParams { bool overlapCommsCompute; Coordinate dirichlet; // Blocksize of dirichlet BCs + int partialDirichlet; AcceleratorVector twist_n_2pi_L; AcceleratorVector boundary_phases; WilsonImplParams() { dirichlet.resize(0); + partialDirichlet=0; boundary_phases.resize(Nd, 1.0); twist_n_2pi_L.resize(Nd, 0.0); }; WilsonImplParams(const AcceleratorVector phi) : boundary_phases(phi), overlapCommsCompute(false) { twist_n_2pi_L.resize(Nd, 0.0); + partialDirichlet=0; dirichlet.resize(0); } }; struct StaggeredImplParams { Coordinate dirichlet; // Blocksize of dirichlet BCs + int partialDirichlet; StaggeredImplParams() { + partialDirichlet=0; dirichlet.resize(0); }; }; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index de2f1979..e2ced552 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -32,17 +32,182 @@ Author: paboyle NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////////////// +// Wilson compressor will need FaceGather policies for: +// Periodic, Dirichlet, and partial Dirichlet for DWF +/////////////////////////////////////////////////////////////// +class FaceGatherPartialDWF +{ +public: + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + // static int PartialCompressionFactor(GridBase *grid) { return 1;} + template + static void Gather_plane_simple (commVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + //DWF only hack: If a direction that is OFF node we use Partial Dirichlet + // Shrinks local and remote comms buffers + GridBase *Grid = rhs.Grid(); + int Ls = Grid->_rdimensions[0]; + std::pair *table_v = & table[0]; + auto rhs_v = rhs.View(AcceleratorRead); + int vol=table.size()/Ls; + accelerator_forNB( idx,table.size(), vobj::Nsimd(), { + Integer i=idx/Ls; + Integer s=idx%Ls; + if(s==0) compress.Compress(buffer[off+i ],rhs_v[so+table_v[idx].second]); + if(s==Ls-1) compress.Compress(buffer[off+i+vol],rhs_v[so+table_v[idx].second]); + }); + } + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + auto Ls = dd.dims[0]; + // Just pass in the Grid + auto kp = dd.kernel_p; + auto mp = dd.mpi_p; + int size= dd.buffer_size; + int vol= size/Ls; + accelerator_forNB(o,size,1,{ + int idx=o/Ls; + int s=o%Ls; + if ( s == 0 ) { + int oo=idx; + kp[o]=mp[oo]; + } else if ( s == Ls-1 ) { + int oo=vol+idx; + kp[o]=mp[oo]; + } else { + kp[o] = Zero();//fill rest with zero if partial dirichlet + } + }); + } + //////////////////////////////////////////////////////////////////////////////////////////// + // Need to gather *interior portions* for ALL s-slices in simd directions + // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side + // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. + //////////////////////////////////////////////////////////////////////////////////////////// + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + GridBase *Grid = rhs.Grid(); + int Ls = Grid->_rdimensions[0]; + + // insertion of zeroes... + assert( (table.size()&0x1)==0); + int num=table.size()/2; + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + + auto rhs_v = rhs.View(AcceleratorRead); + auto p0=&pointers[0][0]; + auto p1=&pointers[1][0]; + auto tp=&table[0]; + int nnum=num/Ls; + accelerator_forNB(j, num, vobj::Nsimd(), { + // Reorders both local and remote comms buffers + // + int s = j % Ls; + int sp1 = (s+1)%Ls; // peri incremented s slice + + int hxyz= j/Ls; + + int xyz0= hxyz*2; // xyzt part of coor + int xyz1= hxyz*2+1; + + int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... + + int kk0= xyz0*Ls + s ; // s=0 goes to s=1 + int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0 + compress.CompressExchange(p0[jj],p1[jj], + rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites + rhs_v[so+tp[kk1 ].second], + type); + }); + rhs_v.ViewClose(); + } + // Merge routine is for SIMD faces + template + static void MergeFace(decompressor decompress,Merger &mm) + { + auto Ls = mm.dims[0]; + int num= mm.buffer_size/2; // relate vol and Ls to buffer size + auto mp = &mm.mpointer[0]; + auto vp0= &mm.vpointers[0][0]; // First arg is exchange first + auto vp1= &mm.vpointers[1][0]; + auto type= mm.type; + int nnum = num/Ls; + accelerator_forNB(o,num,vobj::Nsimd(),{ + + int s=o%Ls; + int hxyz=o/Ls; // xyzt related component + int xyz0=hxyz*2; + int xyz1=hxyz*2+1; + + int sp = (s+1)%Ls; + int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... + + int oo0= s+xyz0*Ls; + int oo1= s+xyz1*Ls; + + // same ss0, ss1 pair goes to new layout + decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type); + }); + } +}; +class FaceGatherDWFMixedBCs +{ +public: + static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/2;}; + + template + static void Gather_plane_simple (commVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); + else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); + } + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial); + else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial); + } + template + static void MergeFace(decompressor decompress,Merger &mm) + { + int partial = mm.partial; + if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm); + else FaceGatherSimple::MergeFace(decompress,mm); + } + + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + int partial = dd.partial; + if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd); + else FaceGatherSimple::DecompressFace(decompress,dd); + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////// -// optimised versions supporting half precision too +// optimised versions supporting half precision too??? Deprecate ///////////////////////////////////////////////////////////////////////////////////////////// -template -class WilsonCompressorTemplate; - +//Could make FaceGather a template param, but then behaviour is runtime not compile time template -class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, - typename std::enable_if::value>::type > +class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs +// : public FaceGatherSimple { public: @@ -79,172 +244,81 @@ public: /*****************************************************/ /* Exchange includes precision change if mpi data is not same */ /*****************************************************/ - accelerator_inline void Exchange(SiteHalfSpinor *mp, - const SiteHalfSpinor * __restrict__ vp0, - const SiteHalfSpinor * __restrict__ vp1, - Integer type,Integer o) const { + accelerator_inline void Exchange(SiteHalfSpinor &mp0, + SiteHalfSpinor &mp1, + const SiteHalfSpinor & vp0, + const SiteHalfSpinor & vp1, + Integer type) const { #ifdef GRID_SIMT - exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + exchangeSIMT(mp0,mp1,vp0,vp1,type); #else SiteHalfSpinor tmp1; SiteHalfSpinor tmp2; - exchange(tmp1,tmp2,vp0[o],vp1[o],type); - vstream(mp[2*o ],tmp1); - vstream(mp[2*o+1],tmp2); + exchange(tmp1,tmp2,vp0,vp1,type); + vstream(mp0,tmp1); + vstream(mp1,tmp2); #endif } - + /*****************************************************/ /* Have a decompression step if mpi data is not same */ /*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, - SiteHalfSpinor * __restrict__ in, Integer o) const { - assert(0); + accelerator_inline void Decompress(SiteHalfSpinor &out, + SiteHalfSpinor &in) const { + out = in; } /*****************************************************/ /* Compress Exchange */ /*****************************************************/ - accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, - SiteHalfSpinor * __restrict__ out1, - const SiteSpinor * __restrict__ in, - Integer j,Integer k, Integer m,Integer type) const + accelerator_inline void CompressExchange(SiteHalfSpinor &out0, + SiteHalfSpinor &out1, + const SiteSpinor &in0, + const SiteSpinor &in1, + Integer type) const { #ifdef GRID_SIMT typedef SiteSpinor vobj; typedef SiteHalfSpinor hvobj; - typedef decltype(coalescedRead(*in)) sobj; - typedef decltype(coalescedRead(*out0)) hsobj; + typedef decltype(coalescedRead(in0)) sobj; + typedef decltype(coalescedRead(out0)) hsobj; unsigned int Nsimd = vobj::Nsimd(); unsigned int mask = Nsimd >> (type + 1); int lane = acceleratorSIMTlane(Nsimd); int j0 = lane &(~mask); // inner coor zero int j1 = lane |(mask) ; // inner coor one - const vobj *vp0 = &in[k]; - const vobj *vp1 = &in[m]; + const vobj *vp0 = &in0; + const vobj *vp1 = &in1; const vobj *vp = (lane&mask) ? vp1:vp0; auto sa = coalescedRead(*vp,j0); auto sb = coalescedRead(*vp,j1); hsobj psa, psb; projector::Proj(psa,sa,mu,dag); projector::Proj(psb,sb,mu,dag); - coalescedWrite(out0[j],psa); - coalescedWrite(out1[j],psb); + coalescedWrite(out0,psa); + coalescedWrite(out1,psb); #else SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp3, temp4; - projector::Proj(temp1,in[k],mu,dag); - projector::Proj(temp2,in[m],mu,dag); + projector::Proj(temp1,in0,mu,dag); + projector::Proj(temp2,in1,mu,dag); exchange(temp3,temp4,temp1,temp2,type); - vstream(out0[j],temp3); - vstream(out1[j],temp4); + vstream(out0,temp3); + vstream(out1,temp4); #endif } /*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) const { return false; } + accelerator_inline bool DecompressionStep(void) const { + return false; + } }; -#if 0 -template -class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, - typename std::enable_if::value>::type > -{ -public: - - int mu,dag; - - void Point(int p) { mu=p; }; - - WilsonCompressorTemplate(int _dag=0){ - dag = _dag; - } - - typedef _Spinor SiteSpinor; - typedef _Hspinor SiteHalfSpinor; - typedef _HCspinor SiteHalfCommSpinor; - typedef typename SiteHalfCommSpinor::vector_type vComplexLow; - typedef typename SiteHalfSpinor::vector_type vComplexHigh; - constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - - accelerator_inline int CommDatumSize(void) const { - return sizeof(SiteHalfCommSpinor); - } - - /*****************************************************/ - /* Compress includes precision change if mpi data is not same */ - /*****************************************************/ - accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const { - SiteHalfSpinor hsp; - SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; - projector::Proj(hsp,in,mu,dag); - precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw); - } - accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const { -#ifdef GRID_SIMT - typedef decltype(coalescedRead(buf)) sobj; - sobj sp; - auto sin = coalescedRead(in); - projector::Proj(sp,sin,mu,dag); - coalescedWrite(buf,sp); -#else - projector::Proj(buf,in,mu,dag); -#endif - } - - /*****************************************************/ - /* Exchange includes precision change if mpi data is not same */ - /*****************************************************/ - accelerator_inline void Exchange(SiteHalfSpinor *mp, - SiteHalfSpinor *vp0, - SiteHalfSpinor *vp1, - Integer type,Integer o) const { - SiteHalfSpinor vt0,vt1; - SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; - SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; - precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw); - precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw); - exchange(mp[2*o],mp[2*o+1],vt0,vt1,type); - } - - /*****************************************************/ - /* Have a decompression step if mpi data is not same */ - /*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const { - SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; - precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); - } - - /*****************************************************/ - /* Compress Exchange */ - /*****************************************************/ - accelerator_inline void CompressExchange(SiteHalfSpinor *out0, - SiteHalfSpinor *out1, - const SiteSpinor *in, - Integer j,Integer k, Integer m,Integer type) const { - SiteHalfSpinor temp1, temp2,temp3,temp4; - SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; - SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; - projector::Proj(temp1,in[k],mu,dag); - projector::Proj(temp2,in[m],mu,dag); - exchange(temp3,temp4,temp1,temp2,type); - precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw); - precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw); - } - - /*****************************************************/ - /* Pass the info to the stencil */ - /*****************************************************/ - accelerator_inline bool DecompressionStep(void) const { return true; } - -}; -#endif - #define DECLARE_PROJ(Projector,Compressor,spProj) \ class Projector { \ public: \ diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 388094b2..992f4d20 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -148,12 +148,21 @@ void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) GaugeField HUmu(_Umu.Grid()); HUmu = _Umu*(-0.5); if ( Dirichlet ) { - std::cout << GridLogDslash << " Dirichlet BCs 5d " <LocalDimensions()[d]; + if (GaugeBlock) assert( (GaugeBlock%ldim)==0); + } + } + if ( Dirichlet && (!this->Params.partialDirichlet) ) { + std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " < Filter(GaugeBlock); Filter.applyFilter(HUmu); + } else { + std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " not filtered gauge field" <