diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index b36d954f..eda6d9e7 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -3,26 +3,108 @@ NAMESPACE_BEGIN(Grid); -template -accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type) -{ - typedef decltype(coalescedRead(mp0)) sobj; - unsigned int Nsimd = vobj::Nsimd(); - unsigned int mask = Nsimd >> (type + 1); - int lane = acceleratorSIMTlane(Nsimd); - int j0 = lane &(~mask); // inner coor zero - int j1 = lane |(mask) ; // inner coor one - const vobj *vpa = &vp0; - const vobj *vpb = &vp1; - const vobj *vp = (lane&mask) ? (vpb) : (vpa); - auto sa = coalescedRead(vp[0],j0); - auto sb = coalescedRead(vp[0],j1); - coalescedWrite(mp0,sa); - coalescedWrite(mp1,sb); -} +class SimpleStencilParams{ +public: + Coordinate dirichlet; + int partialDirichlet; + SimpleStencilParams() { partialDirichlet = 0; }; +}; -template -class SimpleCompressor { + +// Compressors will inherit buffer management policies +// Standard comms buffer management +class FaceGatherSimple +{ +public: + static int PartialCompressionFactor(GridBase *grid) {return 1;}; + // Decompress is after merge so ok + template + static void Gather_plane_simple (commVector >& table, + const Lattice &rhs, + cobj *buffer, + compressor &compress, + int off,int so,int partial) + { + int num=table.size(); + std::pair *table_v = & table[0]; + + auto rhs_v = rhs.View(AcceleratorRead); + accelerator_forNB( i,num, vobj::Nsimd(), { + compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); + }); + rhs_v.ViewClose(); + } + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type,int partial) + { + assert( (table.size()&0x1)==0); + int num=table.size()/2; + int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane + 
+ auto rhs_v = rhs.View(AcceleratorRead); + auto p0=&pointers[0][0]; + auto p1=&pointers[1][0]; + auto tp=&table[0]; + auto rhs_p = &rhs_v[0]; + accelerator_forNB(j, num, vobj::Nsimd(), { + compress.CompressExchange(p0[j],p1[j], + rhs_p[so+tp[2*j ].second], + rhs_p[so+tp[2*j+1].second], + type); + }); + rhs_v.ViewClose(); + } + + template + static void DecompressFace(decompressor decompress,Decompression &dd) + { + auto kp = dd.kernel_p; + auto mp = dd.mpi_p; + accelerator_forNB(o,dd.buffer_size,1,{ + decompress.Decompress(kp[o],mp[o]); + }); + } + template + static void MergeFace(decompressor decompress,Merger &mm) + { + auto mp = &mm.mpointer[0]; + auto vp0= &mm.vpointers[0][0]; + auto vp1= &mm.vpointers[1][0]; + auto type= mm.type; + accelerator_forNB(o,mm.buffer_size/2,vobj::Nsimd(),{ + decompress.Exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + }); + } +}; + +//////////////////////////////////// +// Wilson compressor will add alternate policies for Dirichlet +// and possibly partial Dirichlet for DWF +//////////////////////////////////// +/* +class FaceGatherDirichlet +{ + // If it's dirichlet we don't assemble comms buffers + // + // Rely on zeroes in gauge field to drive the correct result + // NAN propagation: field will locally wrap, so fermion should NOT contain NAN and just permute + template + static void Gather_plane_simple (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so){}; + template + static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + Vector pointers,int dimension,int plane,int cbmask, + compressor &compress,int type) {} + template + static void Merge(decompressor decompress,Merge &mm) { } + template + static void Decompress(decompressor decompress,Decompression &dd) {} +}; +*/ + +template +class SimpleCompressorGather : public FaceGather { public: void Point(int) {}; accelerator_inline int CommDatumSize(void) const { return sizeof(vobj); } @@ -30,20 +112,19 @@ public: 
accelerator_inline void Compress(vobj &buf,const vobj &in) const { coalescedWrite(buf,coalescedRead(in)); } - accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { + accelerator_inline void Exchange(vobj &mp0,vobj &mp1,vobj &vp0,vobj &vp1,Integer type) const { #ifdef GRID_SIMT - exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + exchangeSIMT(mp0,mp1,vp0,vp1,type); #else - exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); + exchange(mp0,mp1,vp0,vp1,type); #endif } - accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } - accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int j,int k, int m,int type) const { + accelerator_inline void Decompress(vobj &out,vobj &in) const { }; + accelerator_inline void CompressExchange(vobj &out0,vobj &out1,const vobj &in0,const vobj &in1,int type) const { #ifdef GRID_SIMT - exchangeSIMT(out0[j],out1[j],in[k],in[m],type); + exchangeSIMT(out0,out1,in0,in1,type); #else - exchange(out0[j],out1[j],in[k],in[m],type); + exchange(out0,out1,in0,in1,type); #endif } // For cshift. Cshift should drop compressor coupling altogether @@ -52,11 +133,18 @@ public: return arg; } }; -class SimpleStencilParams{ -public: - Coordinate dirichlet; - SimpleStencilParams() {}; -}; + +// Standard compressor never needs dirichlet. +// +// Get away with a local periodic wrap and rely on dirac operator to use a zero gauge link as it is faster +// +// Compressors that inherit Dirichlet and Non-dirichlet behaviour. +// +// Currently run-time behaviour through StencilParameters parameters, p.dirichlet +// combined with the FaceGatherSimple behaviour + +template using SimpleCompressor = SimpleCompressorGather; +//template using SimpleCompressorDirichlet = SimpleCompressorGather; NAMESPACE_END(Grid);