Merge branch 'feature/ddhmc' of https://github.com/paboyle/Grid into feature/ddhmc

2026-05-14 22:24:30 +01:00 · 2021-05-06 23:42:39 +02:00
parent 919ced1c31 cff884929c
commit bd181b9481
15 changed files with 425 additions and 136 deletions
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
  std::time_t t = std::time(nullptr);
  std::tm tm_ = *std::localtime(&t);
  std::ostringstream oss; 
-  //      oss << std::put_time(&tm_, "%c %Z");
+  oss << std::put_time(&tm_, "%c %Z");
  header.creation_date = oss.str();
  header.archive_date  = header.creation_date;
@@ -205,11 +205,20 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }
  // Preferred interface
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					std::string ens_label = std::string("DWF"))
  {
    writeConfiguration(Umu,file,0,1,ens_label);
  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
-					int bits32)
+					int bits32,
 					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
@@ -219,8 +228,8 @@ public:
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
-    header.ensemble_id     = "UKQCD";
+    header.ensemble_id     = std::string("UKQCD");
-    header.ensemble_label  = "DWF";
+    header.ensemble_label  = ens_label;
    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -232,7 +241,7 @@ public:
    GaugeStats Stats; Stats(Umu,header);
    MachineCharacteristics(header);
-	uint64_t offset;
+    uint64_t offset;
    // Sod it -- always write 3x3 double
    header.floating_point = std::string("IEEE64BIG");
@@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 #ifndef GRID_CUDA
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
 typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
 #endif
 NAMESPACE_END(Grid);
 ////////////////////
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
+// Deprecate Vec5d
-NAMESPACE_CHECK(ImplStaggered5dVec);  
+//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
 //NAMESPACE_CHECK(ImplStaggered5dVec);  
@@ -910,11 +910,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }
  std::vector<RealD> G_s(Ls,1.0);
  Integer sign = 1; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
  else if ( curr_type == Current::Tadpole ) {
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
      sign = -1;    
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
      assert(b==1 && c==0);
    }
  }
  for(int s=0;s<Ls;s++){
@@ -937,7 +949,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated
@@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
-
+#undef STAG_VEC5D
 #ifdef STAG_VEC5D
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
 #endif
 }
-   
+#endif   
 #define PERMUTE_DIR3 __asm__ (	\
@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-#define LOAD_CHI(b)		\
+#ifdef GRID_SIMT
 #define LOAD_CHI(ptype,b)			\
  const SiteSpinor & ref (b[offset]);				\
  Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane);	\
  Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane);	\
  Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
 #define LOAD_CHI_COMMS(b)		\
  const SiteSpinor & ref (b[offset]);	\
-    Chi_0=ref()()(0);\
+  Chi_0=coalescedRead(ref()()(0),lane);	\
-    Chi_1=ref()()(1);\
+  Chi_1=coalescedRead(ref()()(1),lane);	\
-    Chi_2=ref()()(2);
+  Chi_2=coalescedRead(ref()()(2),lane);
 #define PERMUTE_DIR(dir)	;
 #else
 #define LOAD_CHI(ptype,b)      LOAD_CHI_COMMS(b)
 #define LOAD_CHI_COMMS(b)		\
  const SiteSpinor & ref (b[offset]);	\
  Chi_0=ref()()(0);			\
  Chi_1=ref()()(1);			\
  Chi_2=ref()()(2);
 #define PERMUTE_DIR(dir)			\
  permute##dir(Chi_0,Chi_0);			\
  permute##dir(Chi_1,Chi_1);			\
  permute##dir(Chi_2,Chi_2);
 #endif
 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
+    U_00=coalescedRead(ref()(0,0),lane);				\
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
+    U_10=coalescedRead(ref()(1,0),lane);				\
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
+    U_20=coalescedRead(ref()(2,0),lane);				\
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
+    U_01=coalescedRead(ref()(0,1),lane);				\
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
+    U_11=coalescedRead(ref()(1,1),lane);				\
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
+    U_21=coalescedRead(ref()(2,1),lane);				\
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
+    U_02=coalescedRead(ref()(0,2),lane);				\
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
+    U_12=coalescedRead(ref()(1,2),lane);				\
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);
 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
+    U_00=coalescedRead(ref()(0,0),lane);				\
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
+    U_10=coalescedRead(ref()(1,0),lane);				\
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
+    U_20=coalescedRead(ref()(2,0),lane);				\
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
+    U_01=coalescedRead(ref()(0,1),lane);				\
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
+    U_11=coalescedRead(ref()(1,1),lane);				\
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
+    U_21=coalescedRead(ref()(2,1),lane);				\
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
+    U_02=coalescedRead(ref()(0,2),lane);				\
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
+    U_12=coalescedRead(ref()(1,2),lane);				\
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
    UChi ## _2 += U_22*Chi_2;
 #define PERMUTE_DIR(dir)			\
  permute##dir(Chi_0,Chi_0);			\
  permute##dir(Chi_1,Chi_1);			\
  permute##dir(Chi_2,Chi_2);
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in);					\
+    LOAD_CHI(Perm,in);						\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
-    LOAD_CHI(buf);						\
+    LOAD_CHI_COMMS(buf);					\
  }								
 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
  }
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in);				\
+    LOAD_CHI(Perm,in);					\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
+    LOAD_CHI_COMMS(buf);				\
  }							\
  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
@@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid);
  local  = SE->_is_local;				\
  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
+    { LOAD_CHI_COMMS(buf);	  }				\
    { MULT_ADD(U,Dir,even); }					\
  }								
 #define HAND_DECLARATIONS(Simd) \
  Simd even_0;			\
  Simd even_1;			\
  Simd even_2;			\
  Simd odd_0;			\
  Simd odd_1;			\
  Simd odd_2;		        \
 		      		\
  Simd Chi_0;			\
  Simd Chi_1;			\
  Simd Chi_2;			\
 				\
  Simd U_00;			\
  Simd U_10;			\
  Simd U_20;			\
  Simd U_01;			\
  Simd U_11;			\
  Simd U_21;			\
  Simd U_02;			\
  Simd U_12;			\
  Simd U_22;			
 template <class Impl>
 template <int Naik> accelerator_inline
@@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  Simd even_0; // 12 regs on knc
  Simd even_1;
  Simd even_2;
  Simd odd_0; // 12 regs on knc
  Simd odd_1;
  Simd odd_2;
-  Simd Chi_0;    // two spinor; 6 regs
+  const int Nsimd = SiteHalfSpinor::Nsimd();
-  Simd Chi_1;
+  const int lane=acceleratorSIMTlane(Nsimd);
-  Simd Chi_2;
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
-  
+  HAND_DECLARATIONS(Simt);
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
  calcSiteSpinor result;
  int offset,local,perm, ptype;
  StencilEntry *SE;
@@ -215,7 +241,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }
@@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
-  Simd even_0; // 12 regs on knc
+  const int Nsimd = SiteHalfSpinor::Nsimd();
-  Simd even_1;
+  const int lane=acceleratorSIMTlane(Nsimd);
-  Simd even_2;
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
-  Simd odd_0; // 12 regs on knc
+  HAND_DECLARATIONS(Simt);
  Simd odd_1;
  Simd odd_2;
-  Simd Chi_0;    // two spinor; 6 regs
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
-  Simd Chi_1;
+  calcSiteSpinor result;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset, ptype, local, perm;
  StencilEntry *SE;
@@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  //    int sF=s+LLs*sU;
  {
-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -294,7 +305,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }
@@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
-  Simd even_0; // 12 regs on knc
+  const int Nsimd = SiteHalfSpinor::Nsimd();
-  Simd even_1;
+  const int lane=acceleratorSIMTlane(Nsimd);
-  Simd even_2;
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
-  Simd odd_0; // 12 regs on knc
+  HAND_DECLARATIONS(Simt);
  Simd odd_1;
  Simd odd_2;
-  Simd Chi_0;    // two spinor; 6 regs
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
-  Simd Chi_1;
+  calcSiteSpinor result;
  Simd Chi_2;
  Simd U_00;  // two rows of U matrix
  Simd U_10;
  Simd U_20;  
  Simd U_01;
  Simd U_11;
  Simd U_21;  // 2 reg left.
  Simd U_02;
  Simd U_12;
  Simd U_22; 
  SiteSpinor result;
  int offset, ptype, local;
  StencilEntry *SE;
@@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  //    int sF=s+LLs*sU;
  {
-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out[sF] = out[sF] + result;
+      coalescedWrite(out[sF] , out(sF)+ result);
    }
  }
 }
@@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
 #undef HAND_DECLARATIONS
 NAMESPACE_END(Grid);
@@ -0,0 +1,236 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
 #define QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
 NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////
 // Two flavour ratio
 ///////////////////////////////////////
 template<class Impl>
 class DomainBoundaryPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);
 private:
  FermionOperator<Impl> & NumOp;// the basic operator
  FermionOperator<Impl> & DenOp;// the basic operator
  FermionOperator<Impl> & NumOpDirichlet;// the basic operator
  FermionOperator<Impl> & DenOpDirichlet;// the basic operator
  OperatorFunction<FermionField> &DerivativeSolver;
  OperatorFunction<FermionField> &ActionSolver;
  FermionField Phi; // the pseudo fermion field for this trajectory
  Coordinate Block;
  typedef Lattice<iLorentzVector<Simd> > LinkMask;
  LinkMask ActiveLinks;
  LinkMask PassiveLinks;
  //  FermionField BoundaryMask;
  //  FermionField BoundaryMask;
 public:
  DomainBoundaryPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
 				    FermionOperator<Impl>  &_DenOp,
 				    FermionOperator<Impl>  &_NumOpDirichlet, 
 				    FermionOperator<Impl>  &_DenOpDirichlet, 
 				    OperatorFunction<FermionField> & DS,
 				    OperatorFunction<FermionField> & AS,
 				    Coordinate &_Block
 				    ) : NumOp(_NumOp), DenOp(_DenOp),
 					DerivativeSolver(DS), ActionSolver(AS),
 					Phi(_NumOp.FermionGrid()), Block(_Block) {};
  virtual std::string action_name(){return "DomainBoundaryPseudoFermionRatioAction";}
  virtual std::string LogParameters(){
    std::stringstream sstream;
    sstream << GridLogMessage << "["<<action_name()<<"] Block "<<_Block << std::endl;
    return sstream.str();
  }  
  void Tests(void)
  {
    // Possible checks
    // Pdbar^2 = Pdbar etc..
    // ProjectOmega + ProjectOmegabar = 1;
    // dBoundary Pdbar = dBoundary
    // dOmega, dOmega vs Omega project and d.
  }
  void ProjectBoundary   (FermionField &f)  {    assert(0);  };
  void ProjectBoundaryBar(FermionField &f)  {    assert(0);  };
  void ProjectOmega      (FermionField &f)  {    assert(0);  };
  void ProjectOmegaBar   (FermionField &f)  {    assert(0);  };
  void dInverse     (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  void dBoundaryBar (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  void dBoundary    (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  void dOmega       (FermionOperator<Impl>  &Op,FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  void dOmegaBar    (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  void SolveOmega   (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  void SolveOmegaBar(FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
  // R = 1 - Pdbar DomegaInv Dd DomegabarInv Ddbar
  void R(FermionOperator<Impl>  &Op,FermionOperator<Impl>  &OpDirichlet,FermionField &in,FermionField &out)
  {
    FermionField tmp1(Op.FermionGrid());
    FermionField tmp2(Op.FermionGrid());
    dBoundaryBar(Op,in,tmp1);
    SolveOmegaBar(OpDirichlet,tmp1,tmp2); // 1/2 cost
    dBoundary(Op,tmp2,tmp1);
    SolveOmega(OpDirichlet,tmp1,tmp2); // 1/2 cost
    ProjectBoundaryBar(tmp2);
    out = in - tmp2 ;
  };
  // R = Pdbar - Pdbar Dinv Ddbar 
  void Rinverse(FermionField &in,FermionField &out)
  {
    FermionField tmp1(NumOp.FermionGrid());
    out = in;
    ProjectBoundaryBar(out);
    dInverse(out,tmp1);
    ProjectBoundaryBar(tmp1);
    out = out -tmp1;
  };
  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG)
  {
    // P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
    //
    // NumOp == V
    // DenOp == M
    //
    // Take phi = Vdag^{-1} Mdag eta  ; eta = Mdag^{-1} Vdag Phi
    //
    // P(eta) = e^{- eta^dag eta}
    //
    // e^{x^2/2 sig^2} => sig^2 = 0.5.
    // 
    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
    //
    RealD scale = std::sqrt(0.5);
    FermionField eta(NumOp.FermionGrid());
    FermionField tmp(NumOp.FermionGrid());
    gaussian(pRNG,eta);
    ProjectBoundary(eta);
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    // Note: this hard codes normal equations type solvers; alternate implementation needed for 
    // non-herm style solvers.
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(NumOp);
    DenOp.Mdag(eta,Phi);            // Mdag eta
    tmp = Zero();
    ActionSolver(MdagMOp,Phi,tmp);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
    NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
    Phi=Phi*scale;
  };
  //////////////////////////////////////////////////////
  // S = phi^dag V (Mdag M)^-1 Vdag phi
  //////////////////////////////////////////////////////
  virtual RealD S(const GaugeField &U) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    FermionField X(NumOp.FermionGrid());
    FermionField Y(NumOp.FermionGrid());
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    NumOp.Mdag(Phi,Y);              // Y= Vdag phi
    X=Zero();
    ActionSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
    DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
    RealD action = norm2(Y);
    return action;
  };
  //////////////////////////////////////////////////////
  // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
  //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
  //       + phi^dag V (Mdag M)^-1 dV^dag  phi
  //////////////////////////////////////////////////////
  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
    NumOp.ImportGauge(U);
    DenOp.ImportGauge(U);
    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
    FermionField  X(NumOp.FermionGrid());
    FermionField  Y(NumOp.FermionGrid());
    GaugeField   force(NumOp.GaugeGrid());	
    //Y=Vdag phi
    //X = (Mdag M)^-1 V^dag phi
    //Y = (Mdag)^-1 V^dag  phi
    NumOp.Mdag(Phi,Y);              // Y= Vdag phi
    X=Zero();
    DerivativeSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
    DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
    // phi^dag V (Mdag M)^-1 dV^dag  phi
    NumOp.MDeriv(force , X, Phi, DaggerYes );  dSdU=force;
    // phi^dag dV (Mdag M)^-1 V^dag  phi
    NumOp.MDeriv(force , Phi, X ,DaggerNo  );  dSdU=dSdU+force;
    //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
    //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
    DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
    DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;
    dSdU *= -1.0;
    //dSdU = - Ta(dSdU);
  };
 };
 NAMESPACE_END(Grid);
 #endif
@@ -97,7 +97,19 @@ public:
    tmp = Zero();
    ActionSolver(MdagMOp,Phi,tmp);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
    NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
-
+#define FILTER
 #ifdef FILTER
    Integer OrthogDir=0;
    Integer plane=0;
    if ( getenv("DIR") ) OrthogDir = atoi(getenv("DIR"));
    if ( getenv("COOR") ) plane = atoi(getenv("COOR"));
    std::cout << " *** PseudoFermion FILTER DIR " <<OrthogDir << " plane "<<plane<<std::endl;
    Lattice<iScalar<vInteger> >  coor(NumOp.FermionGrid());
    LatticeCoordinate(coor,OrthogDir);
    tmp = Zero();
    Phi = where(coor==plane,Phi,tmp);
 #endif
    Phi=Phi*scale;
  };
@@ -165,6 +177,25 @@ public:
    dSdU *= -1.0;
    //dSdU = - Ta(dSdU);
 #ifdef FILTER
    std::cout <<" In force  "<<std::endl;
    force = dSdU;
    int mu=0;
    std::cout << " FORCE mu " <<mu<<" L2 "<< norm2(force)<< " Linf " << maxLocalNorm2(force)<<std::endl;
    int plane=0;
    if ( getenv("COOR") ) plane = atoi(getenv("COOR"));
    Lattice<iScalar<vInteger> >  coor(NumOp.GaugeGrid());
    LatticeCoordinate(coor,mu);
    int L = NumOp.GaugeGrid()->FullDimensions()[mu];
    for (Integer p=0;p<L;p++) {
      force = Zero();
      force = where(coor==p,dSdU,force);
      std::cout << " FORCE mu " <<mu<<" PF plane "<<plane<<" T= " <<p<<" L2 "<< norm2(force)<< " Linf " << maxLocalNorm2(force)<<std::endl;
    }
    exit(0);
 #endif
  };
 };
@@ -85,21 +85,18 @@ public:
    std::cout << GridLogDebug << "Stout smearing started\n";
-    // Smear the configurations
+    // C contains the staples multiplied by some rho
    u_smr = U ; // set the smeared field to the current gauge field
    SmearBase->smear(C, U);
    for (int mu = 0; mu < Nd; mu++) {
-      if( mu == OrthogDim )
+      if( mu == OrthogDim ) continue ;
-        tmp = 1.0;  // Don't smear in the orthogonal direction
+      // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
-      else {
+      Umu = peekLorentz(U, mu);
-        tmp = peekLorentz(C, mu);
+      tmp = peekLorentz(C, mu);
-        Umu = peekLorentz(U, mu);
+      iq_mu = Ta( tmp * adj(Umu));  
-        iq_mu = Ta(
+      exponentiate_iQ(tmp, iq_mu);
-                   tmp *
+      pokeLorentz(u_smr, tmp * Umu, mu);
                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
        exponentiate_iQ(tmp, iq_mu);
      }
      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
  };
@@ -65,7 +65,8 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 #else
-#ifndef GRID_SYCL
+//#ifndef GRID_SYCL
 #if 1
 // Use the scalar as our own complex on GPU ... thrust::complex or std::complex
 template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
 typename vsimd::scalar_type
@@ -1,5 +1,11 @@
-- comms threads issue??
+--
-- Part done: Staggered kernel performance on GPU
+-- Comms threads issue??
 -- Part done: Staggered kernel performance on GPU ; eliminate replicas
 -- Antonin - Nd, Nc generic hide and make Gimpl
 -- DWF 5d RB case / Shamir
 -- 4D pseudofermion options
 -- DDHMC
 --
 =========================================================
 General
@@ -92,6 +92,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);
    Meofa.refresh(Umu, SRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }
@@ -57,8 +57,10 @@ int main (int argc, char ** argv)
  double beta = 1.0;
  double c1   = 0.331;
  const int nu = 1;
  std::vector<int> twists(Nd,0);
-  twists[1] = 0;
+  twists[nu] = 1;
  ConjugateGimplD::setDirections(twists);
  ConjugatePlaqPlusRectangleActionR Action(beta,c1);
  //ConjugateWilsonGaugeActionR Action(beta);
@@ -63,6 +63,7 @@ int main (int argc, char ** argv)
  GridSerialRNG            sRNG; 
  GridParallelRNG          pRNG(&Grid);
  GridSerialRNG            sRNG;
  pRNG.SeedFixedIntegers(seeds);
  sRNG.SeedFixedIntegers(serial_seeds);