Merge branch 'feature/ddhmc' of https://github.com/paboyle/Grid into feature/ddhmc

2026-05-31 06:24:18 +01:00 · 2021-05-06 23:42:39 +02:00
parent 919ced1c31 cff884929c
commit bd181b9481
15 changed files with 425 additions and 136 deletions
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
  std::time_t t = std::time(nullptr);
  std::tm tm_ = *std::localtime(&t);
  std::ostringstream oss; 
-  //      oss << std::put_time(&tm_, "%c %Z");
+  oss << std::put_time(&tm_, "%c %Z");
  header.creation_date = oss.str();
  header.archive_date  = header.creation_date;

@@ -205,11 +205,20 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }

+  // Preferred interface
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
+					std::string file, 
+					std::string ens_label = std::string("DWF"))
+  {
+    writeConfiguration(Umu,file,0,1,ens_label);
+  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
-					int bits32)
+					int bits32,
+					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
@@ -219,8 +228,8 @@ public:
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
-    header.ensemble_id     = "UKQCD";
-    header.ensemble_label  = "DWF";
+    header.ensemble_id     = std::string("UKQCD");
+    header.ensemble_label  = ens_label;

    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

-#ifndef GRID_CUDA
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
-#endif
-
 NAMESPACE_END(Grid);

 ////////////////////
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
-NAMESPACE_CHECK(ImplStaggered5dVec);  
+// Deprecate Vec5d
+//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
+//NAMESPACE_CHECK(ImplStaggered5dVec);  


@@ -910,11 +910,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
+  Integer sign = 1; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
+  else if ( curr_type == Current::Tadpole ) {
+    auto b=this->_b;
+    auto c=this->_c;
+    if ( b == 1 && c == 0 ) {
+      sign = -1;    
+    }
+    else {
+      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
+      assert(b==1 && c==0);
+    }
+  }

  for(int s=0;s<Ls;s++){

@@ -937,7 +949,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated

@@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  
-
+#undef STAG_VEC5D
+#ifdef STAG_VEC5D
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
 #endif
 }
   
-   
+#endif   


 #define PERMUTE_DIR3 __asm__ (	\
@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-#define LOAD_CHI(b)		\
+#ifdef GRID_SIMT
+
+#define LOAD_CHI(ptype,b)			\
  const SiteSpinor & ref (b[offset]);				\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
+  Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane);	\
+  Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane);	\
+  Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
+
+#define LOAD_CHI_COMMS(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+  Chi_0=coalescedRead(ref()()(0),lane);	\
+  Chi_1=coalescedRead(ref()()(1),lane);	\
+  Chi_2=coalescedRead(ref()()(2),lane);
+
+#define PERMUTE_DIR(dir)	;
+#else
+#define LOAD_CHI(ptype,b)      LOAD_CHI_COMMS(b)
+
+#define LOAD_CHI_COMMS(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+  Chi_0=ref()()(0);			\
+  Chi_1=ref()()(1);			\
  Chi_2=ref()()(2);

+#define PERMUTE_DIR(dir)			\
+  permute##dir(Chi_0,Chi_0);			\
+  permute##dir(Chi_1,Chi_1);			\
+  permute##dir(Chi_2,Chi_2);
+
+#endif
+

 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);

 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
    UChi ## _2 += U_22*Chi_2;


-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in);					\
+    LOAD_CHI(Perm,in);						\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
-    LOAD_CHI(buf);						\
+    LOAD_CHI_COMMS(buf);					\
  }								

 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
  }


-
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in);				\
+    LOAD_CHI(Perm,in);					\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
+    LOAD_CHI_COMMS(buf);				\
  }							\
  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
@@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid);
  local  = SE->_is_local;				\
  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
+    { LOAD_CHI_COMMS(buf);	  }				\
    { MULT_ADD(U,Dir,even); }					\
  }								

+#define HAND_DECLARATIONS(Simd) \
+  Simd even_0;			\
+  Simd even_1;			\
+  Simd even_2;			\
+  Simd odd_0;			\
+  Simd odd_1;			\
+  Simd odd_2;		        \
+		      		\
+  Simd Chi_0;			\
+  Simd Chi_1;			\
+  Simd Chi_2;			\
+				\
+  Simd U_00;			\
+  Simd U_10;			\
+  Simd U_20;			\
+  Simd U_01;			\
+  Simd U_11;			\
+  Simd U_21;			\
+  Simd U_02;			\
+  Simd U_12;			\
+  Simd U_22;			
+  

 template <class Impl>
 template <int Naik> accelerator_inline
@@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset,local,perm, ptype;

  StencilEntry *SE;
@@ -215,7 +241,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }

@@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local, perm;

  StencilEntry *SE;
@@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);

    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -294,7 +305,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }

@@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local;

  StencilEntry *SE;
@@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out[sF] = out[sF] + result;
+      coalescedWrite(out[sF] , out(sF)+ result);
    }
  }
 }
@@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
+#undef HAND_DECLARATIONS

 NAMESPACE_END(Grid);

@@ -0,0 +1,236 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/pseudofermion/TwoFlavourRatio.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
+#define QCD_PSEUDOFERMION_TWO_FLAVOUR_RATIO_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////
+// Two flavour ratio
+///////////////////////////////////////
+template<class Impl>
+class DomainBoundaryPseudoFermionAction : public Action<typename Impl::GaugeField> {
+public:
+  INHERIT_IMPL_TYPES(Impl);
+
+private:
+  FermionOperator<Impl> & NumOp;// the basic operator
+  FermionOperator<Impl> & DenOp;// the basic operator
+  FermionOperator<Impl> & NumOpDirichlet;// the basic operator
+  FermionOperator<Impl> & DenOpDirichlet;// the basic operator
+
+  OperatorFunction<FermionField> &DerivativeSolver;
+  OperatorFunction<FermionField> &ActionSolver;
+
+  FermionField Phi; // the pseudo fermion field for this trajectory
+  
+  Coordinate Block;
+  typedef Lattice<iLorentzVector<Simd> > LinkMask;
+
+  LinkMask ActiveLinks;
+  LinkMask PassiveLinks;
+
+  //  FermionField BoundaryMask;
+  //  FermionField BoundaryMask;
+
+public:
+  DomainBoundaryPseudoFermionAction(FermionOperator<Impl>  &_NumOp, 
+				    FermionOperator<Impl>  &_DenOp,
+				    FermionOperator<Impl>  &_NumOpDirichlet, 
+				    FermionOperator<Impl>  &_DenOpDirichlet, 
+				    OperatorFunction<FermionField> & DS,
+				    OperatorFunction<FermionField> & AS,
+				    Coordinate &_Block
+				    ) : NumOp(_NumOp), DenOp(_DenOp),
+					DerivativeSolver(DS), ActionSolver(AS),
+					Phi(_NumOp.FermionGrid()), Block(_Block) {};
+      
+  virtual std::string action_name(){return "DomainBoundaryPseudoFermionRatioAction";}
+
+  virtual std::string LogParameters(){
+    std::stringstream sstream;
+    sstream << GridLogMessage << "["<<action_name()<<"] Block "<<_Block << std::endl;
+    return sstream.str();
+  }  
+
+  void Tests(void)
+  {
+    // Possible checks
+    // Pdbar^2 = Pdbar etc..
+    // ProjectOmega + ProjectOmegabar = 1;
+    // dBoundary Pdbar = dBoundary
+    // dOmega, dOmega vs Omega project and d.
+  }
+  void ProjectBoundary   (FermionField &f)  {    assert(0);  };
+  void ProjectBoundaryBar(FermionField &f)  {    assert(0);  };
+  void ProjectOmega      (FermionField &f)  {    assert(0);  };
+  void ProjectOmegaBar   (FermionField &f)  {    assert(0);  };
+
+  void dInverse     (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+  void dBoundaryBar (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+  void dBoundary    (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+  void dOmega       (FermionOperator<Impl>  &Op,FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+  void dOmegaBar    (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+  void SolveOmega   (FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+  void SolveOmegaBar(FermionOperator<Impl>  &Op,FermionField &in,FermionField &out){    assert(0);  };
+
+  // R = 1 - Pdbar DomegaInv Dd DomegabarInv Ddbar
+  void R(FermionOperator<Impl>  &Op,FermionOperator<Impl>  &OpDirichlet,FermionField &in,FermionField &out)
+  {
+    FermionField tmp1(Op.FermionGrid());
+    FermionField tmp2(Op.FermionGrid());
+    dBoundaryBar(Op,in,tmp1);
+    SolveOmegaBar(OpDirichlet,tmp1,tmp2); // 1/2 cost
+    dBoundary(Op,tmp2,tmp1);
+    SolveOmega(OpDirichlet,tmp1,tmp2); // 1/2 cost
+    ProjectBoundaryBar(tmp2);
+    out = in - tmp2 ;
+  };
+  
+  // R = Pdbar - Pdbar Dinv Ddbar 
+  void Rinverse(FermionField &in,FermionField &out)
+  {
+    FermionField tmp1(NumOp.FermionGrid());
+    out = in;
+    ProjectBoundaryBar(out);
+    dInverse(out,tmp1);
+    ProjectBoundaryBar(tmp1);
+    out = out -tmp1;
+  };
+  
+  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG)
+  {
+    // P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
+    //
+    // NumOp == V
+    // DenOp == M
+    //
+    // Take phi = Vdag^{-1} Mdag eta  ; eta = Mdag^{-1} Vdag Phi
+    //
+    // P(eta) = e^{- eta^dag eta}
+    //
+    // e^{x^2/2 sig^2} => sig^2 = 0.5.
+    // 
+    // So eta should be of width sig = 1/sqrt(2) and must multiply by 0.707....
+    //
+    RealD scale = std::sqrt(0.5);
+
+    FermionField eta(NumOp.FermionGrid());
+    FermionField tmp(NumOp.FermionGrid());
+
+    gaussian(pRNG,eta);
+
+    ProjectBoundary(eta);
+    
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    // Note: this hard codes normal equations type solvers; alternate implementation needed for 
+    // non-herm style solvers.
+    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(NumOp);
+
+    DenOp.Mdag(eta,Phi);            // Mdag eta
+    tmp = Zero();
+    ActionSolver(MdagMOp,Phi,tmp);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
+    NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
+
+    Phi=Phi*scale;
+	
+  };
+
+  //////////////////////////////////////////////////////
+  // S = phi^dag V (Mdag M)^-1 Vdag phi
+  //////////////////////////////////////////////////////
+  virtual RealD S(const GaugeField &U) {
+
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    FermionField X(NumOp.FermionGrid());
+    FermionField Y(NumOp.FermionGrid());
+	
+    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
+
+    NumOp.Mdag(Phi,Y);              // Y= Vdag phi
+    X=Zero();
+    ActionSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
+    DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
+
+    RealD action = norm2(Y);
+
+    return action;
+  };
+
+  //////////////////////////////////////////////////////
+  // dS/du = phi^dag dV (Mdag M)^-1 V^dag  phi
+  //       - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ]  (Mdag M)^-1 V^dag  phi
+  //       + phi^dag V (Mdag M)^-1 dV^dag  phi
+  //////////////////////////////////////////////////////
+  virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
+
+    NumOp.ImportGauge(U);
+    DenOp.ImportGauge(U);
+
+    MdagMLinearOperator<FermionOperator<Impl> ,FermionField> MdagMOp(DenOp);
+
+    FermionField  X(NumOp.FermionGrid());
+    FermionField  Y(NumOp.FermionGrid());
+
+    GaugeField   force(NumOp.GaugeGrid());	
+
+
+    //Y=Vdag phi
+    //X = (Mdag M)^-1 V^dag phi
+    //Y = (Mdag)^-1 V^dag  phi
+    NumOp.Mdag(Phi,Y);              // Y= Vdag phi
+    X=Zero();
+    DerivativeSolver(MdagMOp,Y,X);      // X= (MdagM)^-1 Vdag phi
+    DenOp.M(X,Y);                  // Y=  Mdag^-1 Vdag phi
+
+    // phi^dag V (Mdag M)^-1 dV^dag  phi
+    NumOp.MDeriv(force , X, Phi, DaggerYes );  dSdU=force;
+  
+    // phi^dag dV (Mdag M)^-1 V^dag  phi
+    NumOp.MDeriv(force , Phi, X ,DaggerNo  );  dSdU=dSdU+force;
+
+    //    -    phi^dag V (Mdag M)^-1 Mdag dM   (Mdag M)^-1 V^dag  phi
+    //    -    phi^dag V (Mdag M)^-1 dMdag M   (Mdag M)^-1 V^dag  phi
+    DenOp.MDeriv(force,Y,X,DaggerNo);   dSdU=dSdU-force;
+    DenOp.MDeriv(force,X,Y,DaggerYes);  dSdU=dSdU-force;
+
+    dSdU *= -1.0;
+    //dSdU = - Ta(dSdU);
+
+  };
+};
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -97,6 +97,18 @@ public:
    tmp = Zero();
    ActionSolver(MdagMOp,Phi,tmp);  // (VdagV)^-1 Mdag eta = V^-1 Vdag^-1 Mdag eta
    NumOp.M(tmp,Phi);               // Vdag^-1 Mdag eta
+#define FILTER
+#ifdef FILTER
+    Integer OrthogDir=0;
+    Integer plane=0;
+    if ( getenv("DIR") ) OrthogDir = atoi(getenv("DIR"));
+    if ( getenv("COOR") ) plane = atoi(getenv("COOR"));
+    std::cout << " *** PseudoFermion FILTER DIR " <<OrthogDir << " plane "<<plane<<std::endl;
+    Lattice<iScalar<vInteger> >  coor(NumOp.FermionGrid());
+    LatticeCoordinate(coor,OrthogDir);
+    tmp = Zero();
+    Phi = where(coor==plane,Phi,tmp);
+#endif
    
    Phi=Phi*scale;
 	
@@ -165,6 +177,25 @@ public:
    dSdU *= -1.0;
    //dSdU = - Ta(dSdU);

+#ifdef FILTER
+    std::cout <<" In force  "<<std::endl;
+    force = dSdU;
+    int mu=0;
+    std::cout << " FORCE mu " <<mu<<" L2 "<< norm2(force)<< " Linf " << maxLocalNorm2(force)<<std::endl;
+    int plane=0;
+    if ( getenv("COOR") ) plane = atoi(getenv("COOR"));
+    Lattice<iScalar<vInteger> >  coor(NumOp.GaugeGrid());
+
+    LatticeCoordinate(coor,mu);
+    int L = NumOp.GaugeGrid()->FullDimensions()[mu];
+    for (Integer p=0;p<L;p++) {
+      force = Zero();
+      force = where(coor==p,dSdU,force);
+      std::cout << " FORCE mu " <<mu<<" PF plane "<<plane<<" T= " <<p<<" L2 "<< norm2(force)<< " Linf " << maxLocalNorm2(force)<<std::endl;
+    }
+    exit(0);
+#endif
+    
  };
 };

@@ -85,21 +85,18 @@ public:

    std::cout << GridLogDebug << "Stout smearing started\n";

-    // Smear the configurations
+    // C contains the staples multiplied by some rho
+    u_smr = U ; // set the smeared field to the current gauge field
    SmearBase->smear(C, U);

    for (int mu = 0; mu < Nd; mu++) {
-      if( mu == OrthogDim )
-        tmp = 1.0;  // Don't smear in the orthogonal direction
-      else {
-        tmp = peekLorentz(C, mu);
+      if( mu == OrthogDim ) continue ;
+      // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
      Umu = peekLorentz(U, mu);
-        iq_mu = Ta(
-                   tmp *
-                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
+      tmp = peekLorentz(C, mu);
+      iq_mu = Ta( tmp * adj(Umu));  
      exponentiate_iQ(tmp, iq_mu);
-      }
-      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
+      pokeLorentz(u_smr, tmp * Umu, mu);
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
  };
@@ -65,7 +65,8 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 #else


-#ifndef GRID_SYCL
+//#ifndef GRID_SYCL
+#if 1
 // Use the scalar as our own complex on GPU ... thrust::complex or std::complex
 template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
 typename vsimd::scalar_type
@@ -1,5 +1,11 @@
-- comms threads issue??
-- Part done: Staggered kernel performance on GPU
+--
+-- Comms threads issue??
+-- Part done: Staggered kernel performance on GPU ; eliminate replicas
+-- Antonin - Nd, Nc generic hide and make Gimpl
+-- DWF 5d RB case / Shamir
+-- 4D pseudofermion options
+-- DDHMC
+--

 =========================================================
 General
@@ -92,6 +92,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);

+
    Meofa.refresh(Umu, SRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }
@@ -57,8 +57,10 @@ int main (int argc, char ** argv)
  double beta = 1.0;
  double c1   = 0.331;

+  const int nu = 1;
  std::vector<int> twists(Nd,0);
-  twists[1] = 0;
+  twists[nu] = 1;
+
  ConjugateGimplD::setDirections(twists);
  ConjugatePlaqPlusRectangleActionR Action(beta,c1);
  //ConjugateWilsonGaugeActionR Action(beta);
@@ -63,6 +63,7 @@ int main (int argc, char ** argv)

  GridSerialRNG            sRNG; 
  GridParallelRNG          pRNG(&Grid);
+  GridSerialRNG            sRNG;
  pRNG.SeedFixedIntegers(seeds);
  sRNG.SeedFixedIntegers(serial_seeds);