Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet

2026-07-17 15:43:27 +01:00 · 2023-03-21 16:04:27 -04:00
parent 8b43be39c0 e1c326558a
commit 7db8dd7a95
19 changed files with 377 additions and 93 deletions
@@ -81,6 +81,7 @@ public:
  using OperatorFunction<FieldD>::operator();

  RealD   Tolerance;
+  Integer MaxIterationsMshift;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  std::vector<int> IterationsToCompleteShift;  // Iterations for this shift
@@ -95,9 +96,9 @@ public:

  ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 				       GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
-				       int _ReliableUpdateFreq
-				       ) : 
-    MaxIterations(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
+				       int _ReliableUpdateFreq) : 
+    MaxIterationsMshift(maxit),  shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
+    MaxIterations(20000)
  { 
    verbose=1;
    IterationsToCompleteShift.resize(_shifts.order);
@@ -244,7 +245,7 @@ public:
    // Iteration loop
    int k;
  
-    for (k=1;k<=MaxIterations;k++){    
+    for (k=1;k<=MaxIterationsMshift;k++){    

      a = c /cp;
      AXPYTimer.Start();
@@ -350,12 +351,17 @@ public:
 	}
      }

-      if ( all_converged ){
+      if ( all_converged || k == MaxIterationsMshift ){ // final pass of the k<=MaxIterationsMshift loop: clean up here rather than fall through to assert(0)

 	SolverTimer.Stop();
-	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
-	std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
-      
+
+	if ( all_converged ){
+	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+	} else {
+	  std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
+	}
+	
 	// Check answers 
 	for(int s=0; s < nshift; s++) { 
 	  Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@@ -396,12 +402,10 @@ public:

 	return;
      }
-
   
    }
-    // ugly hack
    std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-    //  assert(0);
+    assert(0);
  }

 };
@@ -29,6 +29,7 @@ Author: Christoph Lehner <christoph@lhnr.de>

 #include <Grid/GridCore.h>
 #include <pwd.h>
+#include <syscall.h>

 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
@@ -484,24 +484,26 @@ public:

    int dag = compress.dag;
    int face_idx=0;
+#define vet_same_node(a,b) \
+      { auto tmp = b;  }
    if ( dag ) { 
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
    } else {
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
    }
    this->face_table_computed=1;
    assert(this->u_comm_offset==this->_unified_buffer_size);
@@ -52,13 +52,6 @@ public:
  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
 public:

-#ifdef GRID_SYCL
-#define SYCL_HACK
-#endif  
-#ifdef SYCL_HACK
-  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
-			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
-#endif
  
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _tmp(&FiveDimRedBlackGrid),
  Dirichlet(0)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
+  
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  if  (anisotropyCoeff.isAnisotropic){
@@ -433,11 +433,23 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    });									

 #define ASM_CALL(A)							\
-  thread_for( ss, Nsite, {						\
+  thread_for( sss, Nsite, {						\
+    int ss = st.lo->Reorder(sss);					\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
  });
+#define ASM_CALL_SLICE(A)						\
+  auto grid = in.Grid() ;						\
+  int nt = grid->LocalDimensions()[4];					\
+  int nxyz = Nsite/nt ;							\
+  for(int t=0;t<nt;t++){						\
+  thread_for( sss, nxyz, {						\
+    int ss = t*nxyz+sss;						\
+    int sU = ss;							\
+    int sF = ss*Ls;							\
+    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
+    });}

 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -127,6 +127,8 @@ NAMESPACE_BEGIN(Grid);
 	  ApproxNegPowerAction.tolerances[i]    = action_tolerance[i];
 	  ApproxHalfPowerAction.tolerances[i]   = action_tolerance[i];
 	  ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
+	}
+	for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
 	  ApproxPowerMD.tolerances[i]       = md_tolerance[i];
 	  ApproxNegPowerMD.tolerances[i]    = md_tolerance[i];
 	  ApproxHalfPowerMD.tolerances[i]   = md_tolerance[i];
@@ -29,6 +29,8 @@
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H

+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
+
 NAMESPACE_BEGIN(Grid);

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -58,7 +60,7 @@ NAMESPACE_BEGIN(Grid);
      //Allow derived classes to override the multishift CG
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 #if 0
-	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOp : DenOp);
+	SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 	ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 	msCG(schurOp,in, out);
 #else
@@ -66,7 +68,8 @@ NAMESPACE_BEGIN(Grid);
 	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
-	
+
+	// Action better with higher precision?
 	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	precisionChange(inD2,in);
 	std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
@@ -76,12 +79,12 @@ NAMESPACE_BEGIN(Grid);
      }
      virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 	SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
-	SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+	SchurDifferentiableOperator<ImplF>  schurOpF (numerator ? NumOpF  : DenOpF);

 	FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 	FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
 	std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
-	ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+	ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 	precisionChange(inD2,in);
 	std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
 	msCG(schurOpD2, inD2, out_elemsD2, outD2);
@@ -300,9 +300,9 @@ public:

 protected:
  GridBase *                        _grid;
-
 public:
  GridBase *Grid(void) const { return _grid; }
+  LebesgueOrder *lo;

  ////////////////////////////////////////////////////////////////////////
  // Needed to conveniently communicate gparity parameters into GPU memory
@@ -348,6 +348,7 @@ public:
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
+#ifdef SHM_FAST_PATH
  inline int SameNode(int point) {

    int dimension    = this->_directions[point];
@@ -367,7 +368,40 @@ public:
    if ( displacement == 0 ) return 1;
    return 0;
  }
+#else
+  // fancy calculation for shm code
+  inline int SameNode(int point) {

+    int dimension    = this->_directions[point];
+    int displacement = this->_distances[point];
+
+    int pd              = _grid->_processors[dimension];
+    int fd              = _grid->_fdimensions[dimension];
+    int ld              = _grid->_ldimensions[dimension];
+    int rd              = _grid->_rdimensions[dimension];
+    int simd_layout     = _grid->_simd_layout[dimension];
+    int comm_dim        = _grid->_processors[dimension] >1 ;
+ 
+    int recv_from_rank;
+    int xmit_to_rank;
+
+    if ( ! comm_dim ) return 1;
+
+    int nbr_proc;
+    if (displacement>0) nbr_proc = 1;
+    else                nbr_proc = pd-1;
+
+    // FIXME  this logic needs to be sorted for three link term
+    //    assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
+
+    if ( shm==NULL ) return 0;
+    return 1;
+  }
+#endif
  //////////////////////////////////////////
  // Comms packet queue for asynch thread
  // Use OpenMP Tasks for cleaner ???
@@ -1075,7 +1109,7 @@ public:
    int comms_recv   = this->_comms_recv[point];
    int comms_partial_send   = this->_comms_partial_send[point] ;
    int comms_partial_recv   = this->_comms_partial_recv[point] ;
-
+    
    assert(rhs.Grid()==_grid);
    //	  conformable(_grid,rhs.Grid());

@@ -1146,11 +1180,32 @@ public:
 	  recv_buf=this->u_recv_buf_p;
 	}

+	// potential SHM fast path for intranode
+	int shm_send=0;
+	int shm_recv=0;
+#ifdef SHM_FAST_PATH
+	// Put directly in place if we can
+	send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+	if ( (send_buf==NULL) ) {
+	  shm_send=0;
+	  send_buf = this->u_send_buf_p;
+	} else {
+	  shm_send=1;
+	}
+	void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+	if ( test_ptr != NULL ) shm_recv = 1;
+	//	static int printed;
+	//	if (!printed){
+	  //	  std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	//	  printed = 1;
+	//	}
+#else
 	////////////////////////////////////////////////////////
 	// Gather locally
 	////////////////////////////////////////////////////////
 	send_buf = this->u_send_buf_p; // Gather locally, must send
 	assert(send_buf!=NULL);
+#endif

 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
 	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
@@ -1162,10 +1217,13 @@ public:
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
+	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	  int do_recv = (comms_recv|comms_partial_recv) && (!shm_recv ); // receive is gated by the recv flags, as in the AddPacket call this replaces
+	  
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
-		    xmit_to_rank, comms_send|comms_partial_send,
-		    recv_from_rank, comms_recv|comms_partial_recv,
+		    xmit_to_rank, do_send,
+		    recv_from_rank, do_recv,
 		    xbytes,rbytes);
 	}

@@ -1307,19 +1365,47 @@ public:

 	    int recv_from_rank;
 	    int xmit_to_rank;
-
+	    int shm_send=0;
+	    int shm_recv=0;
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
-
+#ifdef SHM_FAST_PATH
+  #warning STENCIL SHM FAST PATH SELECTED
+	    // shm == receive pointer         if offnode
+	    // shm == Translate[send pointer] if on node -- my view of his send pointer
+	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+	    if (shm==NULL) {
+	      shm = rp;
+	      // we found a packet that comes from MPI and contributes to this shift.
+	      // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+	      // Kernel will add the exterior_terms except if is_same_node.
+	      // leg of stencil
+	      shm_recv=0;
+	    } else {
+	      shm_recv=1;
+	    }
+	    rpointers[i] = shm;
+	    // Test send side
+	    void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+	    if ( test_ptr != NULL ) shm_send = 1;
+	    //	    static int printed;
+	    //	    if (!printed){
+	    //	      std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	    //	      printed = 1;
+	    //	    }
+#else
 	    rpointers[i] = rp;
+#endif
 	    
 	    int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
 	    if ( !duplicate  ) { 
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
+	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      int do_recv = (comms_recv|comms_partial_recv) && (!shm_recv ); // receive is gated by the recv flags, as in the AddPacket call this replaces
 	      AddPacket((void *)sp,(void *)rp,
-			xmit_to_rank,comms_send|comms_partial_send,
-			recv_from_rank,comms_recv|comms_partial_recv,
+			xmit_to_rank,do_send,
+			recv_from_rank,do_recv,
 			xbytes,rbytes);
 	    }

@@ -1329,7 +1415,7 @@ public:

 	  }
 	}
-
+	// rpointer may be doing a remote read in the gather over SHM
 	if ( comms_recv|comms_partial_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}
@@ -248,17 +248,23 @@ inline int  acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////
-#ifdef GRID_SYCL
-NAMESPACE_END(Grid);
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>

+#ifdef GRID_SYCL
 #define GRID_SYCL_LEVEL_ZERO_IPC

-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
+NAMESPACE_END(Grid);
+#if 0
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
+#else
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
+
 NAMESPACE_BEGIN(Grid);

 extern cl::sycl::queue *theGridAccelerator;
@@ -232,31 +232,34 @@ int main(int argc, char **argv) {
  //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

-  OneFlavourRationalParams OFRp; // Up/down
-  OFRp.lo       = 4.0e-5;
+  int SP_iters=10000;
+  
+  RationalActionParams OFRp; // Up/down
+  OFRp.lo       = 6.0e-5;
  OFRp.hi       = 90.0;
-  OFRp.MaxIter  = 60000;
-  OFRp.tolerance= 1.0e-5;
-  OFRp.mdtolerance= 1.0e-3;
+  OFRp.inv_pow  = 2;
+  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
+  OFRp.action_tolerance= 1.0e-8;
+  OFRp.action_degree   = 18;
+  OFRp.md_tolerance= 1.0e-5;
+  OFRp.md_degree   = 14;
  //  OFRp.degree   = 20; converges
  //  OFRp.degree   = 16;
-  OFRp.degree   = 18;
  OFRp.precision= 80;
  OFRp.BoundsCheckFreq=0;
  std::vector<RealD> ActionTolByPole({
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+      1.0e-7,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8
    });
  std::vector<RealD> MDTolByPole({
-      1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more
+      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
      //      1.0e-6,3.0e-7,1.0e-7,1.0e-7,
      //      3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
-      1.0e-8,1.0e-8,1.0e-8,1.0e-8,
      1.0e-8,1.0e-8
    });

@@ -340,6 +343,7 @@ int main(int argc, char **argv) {
  ParamsDirF.dirichlet=Dirichlet;
  ParamsDir.partialDirichlet=1;
  ParamsDirF.partialDirichlet=1;
+  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
@@ -457,9 +461,9 @@ int main(int argc, char **argv) {
  
 #define MIXED_PRECISION
 #ifdef MIXED_PRECISION
-  std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
+  std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
 #else
-  std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
+  std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
 #endif

  typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
@@ -544,19 +548,19 @@ int main(int argc, char **argv) {
      ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet;
      NumeratorsD2.push_back  (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2));
    
-      Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
+      Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
 			   *Numerators[h],*Denominators[h],
 			   *NumeratorsF[h],*DenominatorsF[h],
 			   *NumeratorsD2[h],*DenominatorsD2[h],
-			   OFRp, 400) );
-      Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
+			   OFRp, SP_iters) );
+      Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
 			   *Numerators[h],*Denominators[h],
 			   *NumeratorsF[h],*DenominatorsF[h],
 			   *NumeratorsD2[h],*DenominatorsD2[h],
-			   OFRp, 400) );
+			   OFRp, SP_iters) );
 #else
-      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
-      Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+      Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
 #endif
    }
  }
@@ -183,7 +183,7 @@ int main(int argc, char **argv) {
  // 4/2 => 0.6 dH
  // 3/3 => 0.8 dH .. depth 3, slower
  //MD.MDsteps =  4;
-  MD.MDsteps =  3;
+  MD.MDsteps =  12;
  MD.trajL   = 0.5;

  HMCparameters HMCparams;
@@ -200,8 +200,8 @@ int main(int argc, char **argv) {
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition

  CheckpointerParameters CPparams;
-  CPparams.config_prefix = "ckpoint_DDHMC_lat";
-  CPparams.rng_prefix    = "ckpoint_DDHMC_rng";
+  CPparams.config_prefix = "ckpoint_HMC_lat";
+  CPparams.rng_prefix    = "ckpoint_HMC_rng";
  CPparams.saveInterval  = 1;
  CPparams.format        = "IEEE64BIG";
  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@@ -228,7 +228,7 @@ int main(int argc, char **argv) {
  Real pv_mass      = 1.0;
  //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
  //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
+  std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
  //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });

  auto GridPtr   = TheHMC.Resources.GetCartesian();
@@ -299,8 +299,8 @@ int main(int argc, char **argv) {
  ////////////////////////////////////
  // Collect actions
  ////////////////////////////////////
-  ActionLevel<HMCWrapper::Field> Level1(1);
-  ActionLevel<HMCWrapper::Field> Level2(3);
+  //  ActionLevel<HMCWrapper::Field> Level1(1);
+  ActionLevel<HMCWrapper::Field> Level2(1);
  ActionLevel<HMCWrapper::Field> Level3(15);

  ////////////////////////////////////
@@ -369,7 +369,7 @@ int main(int argc, char **argv) {
 	 ActionCGL, ActionCGR,
 	 DerivativeCGL, DerivativeCGR,
 	 SFRp, true);
-  //  Level2.push_back(&EOFA);
+  Level2.push_back(&EOFA);

  ////////////////////////////////////
  // up down action
@@ -477,7 +477,7 @@ int main(int argc, char **argv) {
  // Gauge action
  /////////////////////////////////////////////////////////////
  Level3.push_back(&GaugeAction);
-  TheHMC.TheAction.push_back(Level1);
+  //  TheHMC.TheAction.push_back(Level1);
  TheHMC.TheAction.push_back(Level2);
  TheHMC.TheAction.push_back(Level3);
  std::cout << GridLogMessage << " Action complete "<< std::endl;
@@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in
      ;;
     *) ;;
 esac
+############### SHM fast path (allow kernels to remote copy over intranode)
+AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no])
+case ${ac_SHM_FAST_PATH} in
+     yes)
+        AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] )
+      ;;
+     *) ;;
+esac

 ############### communication type selection
 AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
@@ -23,12 +23,7 @@ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

-for i in 0 
-do
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
-done
-#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
-#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
+#mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log

+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

@@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
 echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK


-if [ $MPI_LOCALRANKID = "0" ] 
-then
-#  ~psteinbr/build_pti/ze_tracer -h $@
-  onetrace --chrome-device-timeline $@
-else
+#if [ $MPI_LOCALRANKID = "0" ] 
+#then
+#  ~psteinbr/build_pti/ze_tracer -c $@
+#  onetrace --chrome-kernel-timeline $@
+#else
  $@
-fi
+#fi
@@ -1,4 +1,4 @@
-INSTALL=/nfs/site/home/azusayax/install
+INSTALL=/nfs/site/home/paboylx/prereqs/
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@@ -1,5 +1,6 @@
 export https_proxy=http://proxy-chain.intel.com:911
-export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH

 module load intel-release
 source /opt/intel/oneapi/PVC_setup.sh
@@ -0,0 +1,160 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_cshift.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+ ;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian RBGRID(&GRID);
+
+  LatticeComplexD     one(&GRID);
+  LatticeComplexD      zz(&GRID);
+  LatticeComplexD       C(&GRID);
+  LatticeComplexD  Ctilde(&GRID);
+  LatticeComplexD  Cref  (&GRID);
+  LatticeComplexD  Csav  (&GRID);
+  LatticeComplexD    coor(&GRID);
+
+  LatticeSpinMatrixD    S(&GRID);
+  LatticeSpinMatrixD    Stilde(&GRID);
+  
+  Coordinate p({1,3,2,3});
+
+  one = ComplexD(1.0,0.0);
+  zz  = ComplexD(0.0,0.0);
+
+  ComplexD ci(0.0,1.0);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridSerialRNG          sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
+  GridParallelRNG          pRNG(&GRID);
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeFieldD Umu(&GRID);
+
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
+
+  ////////////////////////////////////////////////////
+  // Wilson test
+  ////////////////////////////////////////////////////
+  {
+    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    
+    RealD mass=0.01;
+    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
+    
+    Dw.M(src,tmp);
+
+    std::cout << "Dw src = " <<norm2(src)<<std::endl;
+    std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
+    
+    Dw.FreePropagator(tmp,ref,mass);
+
+    std::cout << "Dw ref = " <<norm2(ref)<<std::endl;
+    
+    ref = ref - src;
+    
+    std::cout << "Dw ref-src = " <<norm2(ref)<<std::endl;
+  }
+
+
+  ////////////////////////////////////////////////////
+  // Wilson prop
+  ////////////////////////////////////////////////////
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Wilson Mom space 4d propagator \n";
+    std::cout<<"****************************************"<<std::endl;
+
+    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+
+    src=Zero();
+    Coordinate point(4,0); // 0,0,0,0
+    SpinColourVectorD ferm;
+    ferm=Zero();
+    ferm()(0)(0) = ComplexD(1.0);
+    pokeSite(ferm,src,point);
+
+    RealD mass=0.01;
+    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    Dw.FreePropagator(src,ref,mass) ;
+
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    result(&GRID); 
+    const int sdir=0;
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    Dw.Mdag(src,tmp);
+    src=tmp;
+    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
+    CG(HermOp,src,result);
+    
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Taking difference" <<std::endl;
+    std::cout << "Dw result "<<norm2(result)<<std::endl;
+    std::cout << "Dw ref     "<<norm2(ref)<<std::endl;
+    
+    diff = ref - result;
+    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
+
+    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
+  }
+
+  
+  Grid_finalize();
+}