Merge pull request #1 from paboyle/develop

Update to Grid's main repo
2026-08-01 00:13:28 +01:00 · 2022-12-13 09:12:57 -05:00
parent 477ebf24f4 07acfe89f2
commit 40ee605591
9 changed files with 270 additions and 216 deletions
@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
+#if defined(GRID_SYCL)
+#include <Grid/lattice/Lattice_reduction_sycl.h>
+#endif

 NAMESPACE_BEGIN(Grid);

@@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sum_gpu(arg,osites);
 #else
  return sum_cpu(arg,osites);
@@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu(arg,osites);
 #else
  return sumD_cpu(arg,osites);
@@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  return sumD_gpu_large(arg,osites);
 #else
  return sumD_cpu(arg,osites);
@@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
-  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+  typename vobj::scalar_object ssum;
+  autoView( arg_v, arg, AcceleratorRead);
+  ssum= sum_gpu(&arg_v[0],osites);
 #else
  autoView(arg_v, arg, CpuRead);
-  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
 #endif  
  arg.Grid()->GlobalSum(ssum);
@@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
@@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
-    
  {
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
-
+    // This code could read coalesce
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, 1,{
 	auto x_l = left_v[ss];
@@ -0,0 +1,125 @@
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Sum the `osites` objects starting at `lat` with a SYCL reduction submitted to
+// theGridAccelerator, and return the total as vobj::scalar_objectD.
+//  - Each site contributes Reduce(lat[osite]); lat is dereferenced inside the kernel.
+//  - The accumulation is carried out in vobj::scalar_object; only the final total
+//    is converted to scalar_objectD (via convertType).
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_objectD sobjD;
+
+  sobj identity; zeroit(identity);
+
+  // Single shared-allocation slot that receives the reduced value.
+  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
+  // The allocation is not initialised, and a SYCL reduction combines into the
+  // existing value of its reduction variable unless initialize_to_identity is
+  // requested -- so the accumulator must start from zero explicitly.
+  mysum[0] = identity;
+
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
+     cgh.parallel_for(cl::sycl::range<1>{osites},
+		      Reduction,
+		      [=] (cl::sycl::id<1> item, auto &sum) {
+      auto osite   = item[0];
+      sum +=Reduce(lat[osite]);
+     });
+   });
+  // Wait for the kernel before reading the result on the host.
+  theGridAccelerator->wait();
+  sobj ret = mysum[0];
+  free(mysum,*theGridAccelerator);
+  sobjD dret; convertType(dret,ret);
+  return dret;
+}
+
+// "Large" variant of the scalar_objectD sum: forwards directly to sumD_gpu_tensor.
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_objectD sobjD;
+  sobjD total = sumD_gpu_tensor(lat,osites);
+  return total;
+}
+// "Small" variant of the scalar_objectD sum: forwards to sumD_gpu_large.
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_objectD sobjD;
+  sobjD total = sumD_gpu_large(lat,osites);
+  return total;
+}
+
+// Sum `osites` objects at `lat`, returning vobj::scalar_objectD; forwards to sumD_gpu_large.
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_objectD sobjD;
+  sobjD total = sumD_gpu_large(lat,osites);
+  return total;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Sum `osites` objects at `lat` and return the total as vobj::scalar_object.
+// The scalar_objectD value produced by sumD_gpu is assigned into a scalar_object.
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
+{
+  using site_scalar = typename vobj::scalar_object;
+  site_scalar total;
+  total = sumD_gpu(lat,osites);
+  return total;
+}
+
+// "Large" variant of sum_gpu: the scalar_objectD value produced by sumD_gpu_large
+// is assigned into a vobj::scalar_object, which is returned.
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  using site_scalar = typename vobj::scalar_object;
+  site_scalar total;
+  total = sumD_gpu_large(lat,osites);
+  return total;
+}
+
+NAMESPACE_END(Grid);
+
+/*
+template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+{
+  Double sumResult; zeroit(sumResult);
+  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
+  Double identity;  zeroit(identity);
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     cgh.parallel_for(cl::sycl::range<1>{L},
+		      Reduction,
+		      [=] (cl::sycl::id<1> index, auto &sum) {
+	 sum +=vec[index];
+     });
+   });
+  theGridAccelerator->wait();
+  Double ret = d_sum[0];
+  free(d_sum,*theGridAccelerator);
+  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
+  return ret;
+}
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_type  scalar;
+
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobjD;
+
+  sobjD ret;
+  scalarD *ret_p = (scalarD *)&ret;
+  
+  const int nsimd = vobj::Nsimd();
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Vector<scalar> buffer(osites*nsimd);
+  scalar *buf = &buffer[0];
+  vector *dat = (vector *)lat;
+
+  for(int w=0;w<words;w++) {
+
+    accelerator_for(ss,osites,nsimd,{
+	int lane = acceleratorSIMTlane(nsimd);
+	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
+    });
+    //Precision change at this point is too late to gain precision
+    ret_p[w] = svm_reduce(buf,nsimd*osites);
+  }
+  return ret;
+}
+*/
@@ -451,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignments
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
+#define FAST_FERM_TO_PROP
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
+#ifdef FAST_FERM_TO_PROP
+  autoView(p_v,p,CpuWrite);
+  autoView(f_v,f,CpuRead);
+  thread_for(idx,p_v.oSites(),{
+      for(int ss = 0; ss < Ns; ++ss) {
+      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
+	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
+      }}
+    });
+#else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -465,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
      pokeSpin(p, pjs, j, s);
    }
+#endif
 }
    
 //template <class Prop, class Ferm>
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
+#ifdef FAST_FERM_TO_PROP
+  autoView(p_v,p,CpuRead);
+  autoView(f_v,f,CpuWrite);
+  thread_for(idx,p_v.oSites(),{
+      for(int ss = 0; ss < Ns; ++ss) {
+      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
+	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
+      }}
+    });
+#else
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
@@ -482,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
      pokeSpin(f, fj, j);
    }
+#endif
 }
    
 //////////////////////////////////////////////
@@ -204,15 +204,18 @@ public:
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

-  static void MassTerm(CloverField& Clover, RealD diag_mass) {
+  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
    Clover += diag_mass;
  }

-  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
-                          CloverTriangleField& Triangle,
-                          RealD csw_t, RealD diag_mass) {
+  static void InvertClover(CloverField& InvClover,
+                            const CloverDiagonalField& diagonal,
+                            const CloverTriangleField& triangle,
+                            CloverDiagonalField&       diagonalInv,
+                            CloverTriangleField&       triangleInv,
+                            bool fixedBoundaries) {

-    // Do nothing
+    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
  }

  // TODO: implement Cmunu for better performances with compact layout, but don't do it
@@ -237,9 +240,17 @@ public:
  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

-  static void MassTerm(CloverField& Clover, RealD diag_mass) {
-    // do nothing!
-    // mass term is multiplied to exp(Clover) below
+  // Can this be avoided?
+  // Store c in every element of `in` whose spin indices are equal and whose
+  // colour indices are equal, on all outer sites; other elements are not touched.
+  // NOTE(review): `in` is a const reference but is opened with AcceleratorWrite
+  // and written through the view.
+  static void IdentityTimesC(const CloverField& in, RealD c) {
+    autoView(in_v, in, AcceleratorWrite);
+    int nColour = Impl::Dimension;
+
+    accelerator_for(ss, in.Grid()->oSites(), 1, {
+      for (int spin=0; spin<Ns; spin++) {
+        for (int col=0; col<nColour; col++) {
+          in_v[ss]()(spin,spin)(col,col) = c;
+        }
+      }
+    });
  }

  static int getNMAX(RealD prec, RealD R) {
@@ -254,175 +265,62 @@ public:
    return NMAX;
  }

-  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
+  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
+  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}

-  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
+  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {

-  	  typedef iMatrix<ComplexD,6> mat;
+    GridBase* grid = Clover.Grid();
+    CloverField ExpClover(grid);

-  	  RealD qn[6];
-  	  RealD qnold[6];
-  	  RealD p[5];
-  	  RealD trA2, trA3, trA4;
+    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);

-  	  mat A2, A3, A4, A5;
-  	  A2 = alpha * alpha * arg * arg;
-  	  A3 = alpha * arg * A2;
-  	  A4 = A2 * A2;
-  	  A5 = A2 * A3;
+    Clover *= (1.0/diag_mass);

-  	  trA2 = toReal( trace(A2) );
-  	  trA3 = toReal( trace(A3) );
-  	  trA4 = toReal( trace(A4));
-
-  	  p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
-  	  p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
-  	  p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
-  	  p[3] = trA3 / 3.0;
-  	  p[4] = 0.5 * trA2;
-
-  	  qnold[0] = cN[Niter];
-  	  qnold[1] = 0.0;
-  	  qnold[2] = 0.0;
-  	  qnold[3] = 0.0;
-  	  qnold[4] = 0.0;
-  	  qnold[5] = 0.0;
-
-  	  for(int i = Niter-1; i >= 0; i--)
-  	  {
-  	   qn[0] = p[0] * qnold[5] + cN[i];
-  	   qn[1] = p[1] * qnold[5] + qnold[0];
-  	   qn[2] = p[2] * qnold[5] + qnold[1];
-  	   qn[3] = p[3] * qnold[5] + qnold[2];
-  	   qn[4] = p[4] * qnold[5] + qnold[3];
-  	   qn[5] = qnold[4];
-
-  	   qnold[0] = qn[0];
-  	   qnold[1] = qn[1];
-  	   qnold[2] = qn[2];
-  	   qnold[3] = qn[3];
-  	   qnold[4] = qn[4];
-  	   qnold[5] = qn[5];
-  	  }
-
-  	  mat unit(1.0);
-
-  	  dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
-
-    }
-
-  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
-
-    GridBase* grid = Diagonal.Grid();
-    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
-
-    //
-    // Implementation completely in Daniel's layout
-    //
-
-    // Taylor expansion with Cayley-Hamilton recursion
-    // underlying Horner scheme as above
+    // Taylor expansion, slow but generic
+    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
+    // qN = cN
+    // qn = cn + qn+1 X
    std::vector<RealD> cn(NMAX+1);
    cn[0] = 1.0;
-    for (int i=1; i<=NMAX; i++){
+    for (int i=1; i<=NMAX; i++)
      cn[i] = cn[i-1] / RealD(i);
-    }

-      // Taken over from Daniel's implementation
-      conformable(Diagonal, Triangle);
+    ExpClover = Zero();
+    IdentityTimesC(ExpClover, cn[NMAX]);
+    for (int i=NMAX-1; i>=0; i--)
+      ExpClover = ExpClover * Clover + cn[i];

-      long lsites = grid->lSites();
-    {
-      typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
-      typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
-      typedef iMatrix<ComplexD,6> mat;
+    // prepare inverse
+    CloverInv = (-1.0)*Clover;

-      autoView(diagonal_v,  Diagonal,  CpuRead);
-      autoView(triangle_v,  Triangle,  CpuRead);
-      autoView(diagonalExp_v, Diagonal, CpuWrite);
-      autoView(triangleExp_v, Triangle, CpuWrite);
+    Clover = ExpClover * diag_mass;

-      thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
+    ExpClover = Zero();
+    IdentityTimesC(ExpClover, cn[NMAX]);
+    for (int i=NMAX-1; i>=0; i--)
+      ExpClover = ExpClover * CloverInv + cn[i];

-    	  mat srcCloverOpUL(0.0); // upper left block
-    	  mat srcCloverOpLR(0.0); // lower right block
-    	  mat ExpCloverOp;
+    CloverInv = ExpClover * (1.0/diag_mass);

-        scalar_object_diagonal diagonal_tmp     = Zero();
-        scalar_object_diagonal diagonal_exp_tmp = Zero();
-        scalar_object_triangle triangle_tmp     = Zero();
-        scalar_object_triangle triangle_exp_tmp = Zero();
-
-        Coordinate lcoor;
-        grid->LocalIndexToLocalCoor(site, lcoor);
-
-        peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
-        peekLocalSite(triangle_tmp, triangle_v, lcoor);
-
-        int block;
-        block = 0;
-        for(int i = 0; i < 6; i++){
-        	for(int j = 0; j < 6; j++){
-        		if (i == j){
-        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
-        		}
-        		else{
-        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
-        		}
-        	}
-        }
-        block = 1;
-        for(int i = 0; i < 6; i++){
-          	for(int j = 0; j < 6; j++){
-           		if (i == j){
-           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
-           		}
-           		else{
-           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
-           		}
-            }
-        }
-
-        // exp(Clover)
-
-        ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
-
-        block = 0;
-        for(int i = 0; i < 6; i++){
-        	for(int j = 0; j < 6; j++){
-            	if (i == j){
-            		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
-            	}
-            	else if(i < j){
-            		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
-            	}
-           	}
-        }
-
-        ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
-
-        block = 1;
-        for(int i = 0; i < 6; i++){
-        	for(int j = 0; j < 6; j++){
-              	if (i == j){
-              		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
-               	}
-               	else if(i < j){
-               		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
-               	}
-            }
-        }
-
-        pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
-        pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
-      });
-    }
-
-    Diagonal *= diag_mass;
-    Triangle *= diag_mass;
  }

+  // Fill diagonalInv / triangleInv with the inverse clover term.
+  //  - fixedBoundaries == false: the field passed as InvClover is converted
+  //    into the compact layout (diagonal and triangle are not read).
+  //  - fixedBoundaries == true : diagonal / triangle are inverted explicitly
+  //    (InvClover is not read).
+  static void InvertClover(CloverField& InvClover,
+                            const CloverDiagonalField& diagonal,
+                            const CloverTriangleField& triangle,
+                            CloverDiagonalField&       diagonalInv,
+                            CloverTriangleField&       triangleInv,
+                            bool fixedBoundaries) {
+
+    if (!fixedBoundaries) {
+      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
+      return;
+    }
+
+    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
+  }

  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
    assert(0);
@@ -225,7 +225,7 @@ public:
  RealD csw_t;
  RealD cF;

-  bool open_boundaries;
+  bool fixedBoundaries;

  CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
@@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
-  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
@@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
    csw_r /= clover_anisotropy.xi_0;

  ImportGauge(_Umu);
-  if (open_boundaries) {
+  if (fixedBoundaries) {
    this->BoundaryMaskEven.Checkerboard() = Even;
    this->BoundaryMaskOdd.Checkerboard() = Odd;
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
@@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
-  if(this->open_boundaries) ApplyBoundaryMask(out);
+  if(this->fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
-  if(this->open_boundaries) {
+  if(this->fixedBoundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
 }
@@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
@@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
  WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
@@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
@@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }

 template<class Impl, class CloverHelpers>
@@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField

 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
-  assert(!open_boundaries); // TODO check for changes required for open bc
+  assert(!fixedBoundaries); // TODO check for changes required for open bc

  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
@@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);
+  CloverField TmpInverse(grid);

  // Compute the field strength terms mu>nu
  double t2 = usecond();
@@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
-  // Handle mass term based on clover policy
-  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
-  
-  // Convert the data layout of the clover term
+
+  // Instantiate the clover term
+  // - In case of the standard clover the mass term is added
+  // - In case of the exponential clover the clover term is exponentiated
  double t4 = usecond();
+  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
+
+  // Convert the data layout of the clover term
+  double t5 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);

-  // Exponentiate the clover (nothing happens in case of the standard clover)
-  double t5 = usecond();
-  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
-
-  // Possible modify the boundary values
+  // Modify the clover term at the temporal boundaries in case of open boundary conditions
  double t6 = usecond();
-  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
+  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);

-  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
+  // Invert the Clover term
+  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
+  // in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
+  // TODO: For now this inversion is explicitly done on the CPU
  double t7 = usecond();
-  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
+  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);

  // Fill the remaining clover fields
  double t8 = usecond();
@@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
  std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
  std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
  std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
  std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
  std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 }
@@ -6,7 +6,7 @@

 source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

-export NT=16
+export NT=8

 export I_MPI_OFFLOAD=1
 export I_MPI_OFFLOAD_TOPOLIB=level_zero
@@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
 export ZE_AFFINITY_MASK=0.0
-mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
+mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768

 export ZE_AFFINITY_MASK=0
 export I_MPI_OFFLOAD_CELL=device
@@ -8,7 +8,6 @@ source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

 export NT=16

-
 # export IGC_EnableLSCFenceUGMBeforeEOT=0
 # export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
 #export IGC_ShaderDumpEnable=1 
@@ -20,14 +19,16 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > dw.2tile.1x2.log
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > dw.2tile.2x1.log
-
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
+for i in 0 
+do
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
+done
+#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
+#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log


@@ -2,14 +2,15 @@ INSTALL=/nfs/site/home/azusayax/install
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
-	--enable-comms=mpi \
+	--enable-comms=mpi-auto \
 	--disable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-unified=yes \
-	CXX=mpicxx \
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=dpcpp \
 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-	CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"