From f1ed988aa30d4e437b03e090fc7e897f830d2697 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Jun 2025 05:04:12 +0200 Subject: [PATCH] Interface to reduced precision comms --- .../action/fermion/ImprovedStaggeredFermion.h | 6 + .../fermion/ImprovedStaggeredFermion5D.h | 6 + .../action/fermion/NaiveStaggeredFermion.h | 6 + Grid/qcd/action/fermion/WilsonCompressor.h | 206 +----------------- Grid/qcd/action/fermion/WilsonFermion.h | 6 + Grid/qcd/action/fermion/WilsonFermion5D.h | 9 +- 6 files changed, 33 insertions(+), 206 deletions(-) diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index f7655f24..e567c19f 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -154,6 +154,12 @@ public: StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 2641a6b8..ce65bfa3 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -179,6 +179,12 @@ public: StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index 9ec6be90..53b18267 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -146,6 +146,12 @@ public: StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 22f2d8e3..458f2c83 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -32,209 +32,6 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -/////////////////////////////////////////////////////////////// -// Wilson compressor will need FaceGather policies for: -// Periodic, Dirichlet, and partial Dirichlet for DWF -/////////////////////////////////////////////////////////////// -const int dwf_compressor_depth=2; -#define DWF_COMPRESS -class FaceGatherPartialDWF -{ -public: -#ifdef DWF_COMPRESS - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; -#else - static int PartialCompressionFactor(GridBase *grid) { return 1;} -#endif - template - static void Gather_plane_simple (deviceVector >& table, - const Lattice &rhs, - cobj *buffer, - compressor &compress, - int off,int so,int partial) - { - //DWF only hack: If a direction that is OFF node we use Partial Dirichlet - // Shrinks local and remote comms buffers - GridBase *Grid = rhs.Grid(); - int Ls = Grid->_rdimensions[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth=Ls/2; -#endif - std::pair *table_v = & table[0]; - auto rhs_v = rhs.View(AcceleratorRead); - int vol=table.size()/Ls; - accelerator_forNB( idx,table.size(), vobj::Nsimd(), { - Integer i=idx/Ls; - Integer s=idx%Ls; - Integer sc=depth+s-(Ls-depth); - if(s=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]); - }); - rhs_v.ViewClose(); - } - template - static void DecompressFace(decompressor decompress,Decompression &dd) - { - auto Ls = dd.dims[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth=Ls/2; -#endif - // Just pass in the Grid - auto kp = dd.kernel_p; - auto mp = dd.mpi_p; - int size= dd.buffer_size; - int vol= size/Ls; - accelerator_forNB(o,size,1,{ - int idx=o/Ls; - int s=o%Ls; - if ( s < depth ) { - int oo=s*vol+idx; - kp[o]=mp[oo]; - } else if ( s >= Ls-depth ) { - int sc = depth + s - (Ls-depth); - int oo=sc*vol+idx; - kp[o]=mp[oo]; - } else { - kp[o] = Zero();//fill rest with zero if partial dirichlet - } - }); - } - //////////////////////////////////////////////////////////////////////////////////////////// - // Need to gather *interior portions* for ALL s-slices in simd directions - // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side - // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. - //////////////////////////////////////////////////////////////////////////////////////////// - template - static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, - std::vector pointers,int dimension,int plane,int cbmask, - compressor &compress,int type,int partial) - { - GridBase *Grid = rhs.Grid(); - int Ls = Grid->_rdimensions[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth = Ls/2; -#endif - - // insertion of zeroes... - assert( (table.size()&0x1)==0); - int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - - auto rhs_v = rhs.View(AcceleratorRead); - auto p0=&pointers[0][0]; - auto p1=&pointers[1][0]; - auto tp=&table[0]; - int nnum=num/Ls; - accelerator_forNB(j, num, vobj::Nsimd(), { - // Reorders both local and remote comms buffers - // - int s = j % Ls; - int sp1 = (s+depth)%Ls; // peri incremented s slice - - int hxyz= j/Ls; - - int xyz0= hxyz*2; // xyzt part of coor - int xyz1= hxyz*2+1; - - int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... - - int kk0= xyz0*Ls + s ; // s=0 goes to s=1 - int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0 - compress.CompressExchange(p0[jj],p1[jj], - rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites - rhs_v[so+tp[kk1 ].second], - type); - }); - rhs_v.ViewClose(); - } - // Merge routine is for SIMD faces - template - static void MergeFace(decompressor decompress,Merger &mm) - { - auto Ls = mm.dims[0]; -#ifdef DWF_COMPRESS - int depth=dwf_compressor_depth; -#else - int depth = Ls/2; -#endif - int num= mm.buffer_size/2; // relate vol and Ls to buffer size - auto mp = &mm.mpointer[0]; - auto vp0= &mm.vpointers[0][0]; // First arg is exchange first - auto vp1= &mm.vpointers[1][0]; - auto type= mm.type; - int nnum = num/Ls; - accelerator_forNB(o,num,Merger::Nsimd,{ - - int s=o%Ls; - int hxyz=o/Ls; // xyzt related component - int xyz0=hxyz*2; - int xyz1=hxyz*2+1; - - int sp = (s+depth)%Ls; - int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice .... - - int oo0= s+xyz0*Ls; - int oo1= s+xyz1*Ls; - - // same ss0, ss1 pair goes to new layout - decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type); - }); - } -}; -class FaceGatherDWFMixedBCs -{ -public: -#ifdef DWF_COMPRESS - static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);}; -#else - static int PartialCompressionFactor(GridBase *grid) {return 1;} -#endif - - template - static void Gather_plane_simple (deviceVector >& table, - const Lattice &rhs, - cobj *buffer, - compressor &compress, - int off,int so,int partial) - { - // std::cout << " face gather simple DWF partial "< - static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, - std::vector pointers,int dimension,int plane,int cbmask, - compressor &compress,int type,int partial) - { - // std::cout << " face gather exch DWF partial "< - static void MergeFace(decompressor decompress,Merger &mm) - { - int partial = mm.partial; - // std::cout << " merge DWF partial "< - static void DecompressFace(decompressor decompress,Decompression &dd) - { - int partial = dd.partial; - // std::cout << " decompress DWF partial "< -class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs -// : public FaceGatherSimple +class WilsonCompressorTemplate : public FaceGatherSimple { public: diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 16320a93..41a2471e 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -165,6 +165,12 @@ public: StencilImpl Stencil; StencilImpl StencilEven; StencilImpl StencilOdd; + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Copy of the gauge field , with even and odd subsets DoubledGaugeField Umu; diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 9dadc866..ea0eaa32 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -204,7 +204,14 @@ public: DoubledGaugeField Umu; DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - + + + void SloppyComms(int sloppy) + { + Stencil.SetSloppyComms(sloppy); + StencilEven.SetSloppyComms(sloppy); + StencilOdd.SetSloppyComms(sloppy); + } // Comms buffer // std::vector > comm_buf;