mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/staggered-comms-compute
Conflicts: lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
This commit is contained in:
		@@ -52,6 +52,35 @@ namespace QCD {
 | 
			
		||||
 { 
 | 
			
		||||
 }
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////
 | 
			
		||||
// Physical surface field utilities
 | 
			
		||||
///////////////////////////////////////////////////////////////
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
 | 
			
		||||
{
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  FermionField tmp(this->FermionGrid());
 | 
			
		||||
  tmp = solution5d;
 | 
			
		||||
  conformable(solution5d._grid,this->FermionGrid());
 | 
			
		||||
  conformable(exported4d._grid,this->GaugeGrid());
 | 
			
		||||
  axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
 | 
			
		||||
  axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1);
 | 
			
		||||
  ExtractSlice(exported4d, tmp, 0, 0);
 | 
			
		||||
}
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 | 
			
		||||
{
 | 
			
		||||
  int Ls = this->Ls;
 | 
			
		||||
  FermionField tmp(this->FermionGrid());
 | 
			
		||||
  conformable(imported5d._grid,this->FermionGrid());
 | 
			
		||||
  conformable(input4d._grid   ,this->GaugeGrid());
 | 
			
		||||
  tmp = zero;
 | 
			
		||||
  InsertSlice(input4d, tmp, 0   , 0);
 | 
			
		||||
  InsertSlice(input4d, tmp, Ls-1, 0);
 | 
			
		||||
  axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
 | 
			
		||||
  axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
 | 
			
		||||
  Dminus(tmp,imported5d);
 | 
			
		||||
}
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
@@ -73,7 +102,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
 | 
			
		||||
  this->DW(psi,tmp_f,DaggerYes);
 | 
			
		||||
 | 
			
		||||
  for(int s=0;s<Ls;s++){
 | 
			
		||||
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
 | 
			
		||||
    axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -83,8 +83,13 @@ namespace Grid {
 | 
			
		||||
      virtual void   M5D   (const FermionField &psi, FermionField &chi);
 | 
			
		||||
      virtual void   M5Ddag(const FermionField &psi, FermionField &chi);
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      // Physical surface field utilities
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      virtual void   Dminus(const FermionField &psi, FermionField &chi);
 | 
			
		||||
      virtual void   DminusDag(const FermionField &psi, FermionField &chi);
 | 
			
		||||
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
 | 
			
		||||
      virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
 | 
			
		||||
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // Instantiate different versions depending on Impl
 | 
			
		||||
 
 | 
			
		||||
@@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
 | 
			
		||||
	}
 | 
			
		||||
	a0 = a0+incr;
 | 
			
		||||
	a1 = a1+incr;
 | 
			
		||||
	a2 = a2+sizeof(Simd::scalar_type);
 | 
			
		||||
	a2 = a2+sizeof(typename Simd::scalar_type);
 | 
			
		||||
      }}
 | 
			
		||||
    {
 | 
			
		||||
      int lexa = s1+LLs*site;
 | 
			
		||||
@@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
 | 
			
		||||
	}
 | 
			
		||||
	a0 = a0+incr;
 | 
			
		||||
	a1 = a1+incr;
 | 
			
		||||
	a2 = a2+sizeof(Simd::scalar_type);
 | 
			
		||||
	a2 = a2+sizeof(typename Simd::scalar_type);
 | 
			
		||||
      }}
 | 
			
		||||
    {
 | 
			
		||||
      int lexa = s1+LLs*site;
 | 
			
		||||
 
 | 
			
		||||
@@ -295,6 +295,27 @@ namespace Grid {
 | 
			
		||||
      assert((Ls&0x1)==1); // Odd Ls required
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      conformable(solution5d._grid,this->FermionGrid());
 | 
			
		||||
      conformable(exported4d._grid,this->GaugeGrid());
 | 
			
		||||
      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
 | 
			
		||||
    }
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      conformable(imported5d._grid,this->FermionGrid());
 | 
			
		||||
      conformable(input4d._grid   ,this->GaugeGrid());
 | 
			
		||||
      FermionField tmp(this->FermionGrid());
 | 
			
		||||
      tmp=zero;
 | 
			
		||||
      InsertSlice(input4d, tmp, Ls-1, Ls-1);
 | 
			
		||||
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
 | 
			
		||||
      this->Dminus(tmp,imported5d);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    FermOpTemplateInstantiate(ContinuedFractionFermion5D);
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -65,6 +65,14 @@ namespace Grid {
 | 
			
		||||
      // Efficient support for multigrid coarsening
 | 
			
		||||
      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      // Physical surface field utilities
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case
 | 
			
		||||
      //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case
 | 
			
		||||
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
 | 
			
		||||
      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      ContinuedFractionFermion5D(GaugeField &_Umu,
 | 
			
		||||
				 GridCartesian         &FiveDimGrid,
 | 
			
		||||
 
 | 
			
		||||
@@ -475,7 +475,7 @@ namespace QCD {
 | 
			
		||||
                        }
 | 
			
		||||
                        a0 = a0 + incr;
 | 
			
		||||
                        a1 = a1 + incr;
 | 
			
		||||
                        a2 = a2 + sizeof(Simd::scalar_type);
 | 
			
		||||
                        a2 = a2 + sizeof(typename Simd::scalar_type);
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -50,11 +50,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like
 | 
			
		||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like
 | 
			
		||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h>     // 4d wilson like
 | 
			
		||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
 | 
			
		||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
 | 
			
		||||
//#include <Grid/qcd/action/fermion/CloverFermion.h>
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
@@ -104,10 +106,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
 | 
			
		||||
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 | 
			
		||||
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
 | 
			
		||||
 | 
			
		||||
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
 | 
			
		||||
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
 | 
			
		||||
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
 | 
			
		||||
 | 
			
		||||
// Twisted mass fermion
 | 
			
		||||
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
 | 
			
		||||
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 | 
			
		||||
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 | 
			
		||||
 | 
			
		||||
// Clover fermions
 | 
			
		||||
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
 | 
			
		||||
 | 
			
		||||
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
 | 
			
		||||
 | 
			
		||||
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
 | 
			
		||||
 | 
			
		||||
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 | 
			
		||||
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
 | 
			
		||||
 | 
			
		||||
// Domain Wall fermions
 | 
			
		||||
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 | 
			
		||||
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 | 
			
		||||
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 | 
			
		||||
 
 | 
			
		||||
@@ -70,7 +70,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#define TwoIndexFermOpTemplateInstantiate(A) \
 | 
			
		||||
  template class A<WilsonTwoIndexSymmetricImplF>; \
 | 
			
		||||
  template class A<WilsonTwoIndexSymmetricImplD>; 
 | 
			
		||||
  template class A<WilsonTwoIndexSymmetricImplD>; \
 | 
			
		||||
  template class A<WilsonTwoIndexAntiSymmetricImplF>; \
 | 
			
		||||
  template class A<WilsonTwoIndexAntiSymmetricImplD>;
 | 
			
		||||
 | 
			
		||||
#define FermOp5dVecTemplateInstantiate(A) \
 | 
			
		||||
  template class A<DomainWallVec5dImplF>;	\
 | 
			
		||||
 
 | 
			
		||||
@@ -47,6 +47,7 @@ namespace Grid {
 | 
			
		||||
      INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
 | 
			
		||||
      virtual ~FermionOperator(void) = default;
 | 
			
		||||
 | 
			
		||||
      virtual FermionField &tmp(void) = 0;
 | 
			
		||||
 | 
			
		||||
@@ -115,6 +116,34 @@ namespace Grid {
 | 
			
		||||
      ///////////////////////////////////////////////
 | 
			
		||||
      virtual void ImportGauge(const GaugeField & _U)=0;
 | 
			
		||||
 | 
			
		||||
      //////////////////////////////////////////////////////////////////////
 | 
			
		||||
      // Conserved currents, either contract at sink or insert sequentially.
 | 
			
		||||
      //////////////////////////////////////////////////////////////////////
 | 
			
		||||
      virtual void ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                            PropagatorField &q_in_2,
 | 
			
		||||
                                            PropagatorField &q_out,
 | 
			
		||||
                                            Current curr_type,
 | 
			
		||||
                                            unsigned int mu)=0;
 | 
			
		||||
      virtual void SeqConservedCurrent(PropagatorField &q_in, 
 | 
			
		||||
                                       PropagatorField &q_out,
 | 
			
		||||
                                       Current curr_type,
 | 
			
		||||
                                       unsigned int mu,
 | 
			
		||||
                                       std::vector<Real> mom,
 | 
			
		||||
                                       unsigned int tmin, 
 | 
			
		||||
                                       unsigned int tmax)=0;
 | 
			
		||||
      ///////////////////////////////////////////////
 | 
			
		||||
      // Physical field import/export
 | 
			
		||||
      ///////////////////////////////////////////////
 | 
			
		||||
      virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; }
 | 
			
		||||
      virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
 | 
			
		||||
      virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
 | 
			
		||||
      {
 | 
			
		||||
	imported = input;
 | 
			
		||||
      };
 | 
			
		||||
      virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported)
 | 
			
		||||
      {
 | 
			
		||||
	exported=solution;
 | 
			
		||||
      };
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -164,6 +164,7 @@ namespace QCD {
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
    static const int Dimension = Representation::Dimension;
 | 
			
		||||
    static const bool isFundamental = Representation::isFundamental;
 | 
			
		||||
    static const bool LsVectorised=false;
 | 
			
		||||
    static const int Nhcs = Options::Nhcs;
 | 
			
		||||
 | 
			
		||||
@@ -212,6 +213,13 @@ namespace QCD {
 | 
			
		||||
                         StencilImpl &St) {
 | 
			
		||||
      mult(&phi(), &U(mu), &chi());
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    inline void multLinkProp(SitePropagator &phi,
 | 
			
		||||
                             const SiteDoubledGaugeField &U,
 | 
			
		||||
                             const SitePropagator &chi,
 | 
			
		||||
                             int mu) {
 | 
			
		||||
       mult(&phi(), &U(mu), &chi());
 | 
			
		||||
    }
 | 
			
		||||
      
 | 
			
		||||
    template <class ref>
 | 
			
		||||
    inline void loadLinkElement(Simd ®, ref &memory) {
 | 
			
		||||
@@ -254,8 +262,22 @@ namespace QCD {
 | 
			
		||||
      GaugeLinkField link(mat._grid);
 | 
			
		||||
      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
 | 
			
		||||
      PokeIndex<LorentzIndex>(mat,link,mu);
 | 
			
		||||
    }   
 | 
			
		||||
    }  
 | 
			
		||||
    
 | 
			
		||||
    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
 | 
			
		||||
      mat = outerProduct(B,A); 
 | 
			
		||||
    }  
 | 
			
		||||
 | 
			
		||||
    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
 | 
			
		||||
      mat = TraceIndex<SpinIndex>(P); 
 | 
			
		||||
    }
 | 
			
		||||
      
 | 
			
		||||
    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
 | 
			
		||||
      for (int mu = 0; mu < Nd; mu++)
 | 
			
		||||
      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
 | 
			
		||||
      
 | 
			
		||||
      int Ls=Btilde._grid->_fdimensions[0];
 | 
			
		||||
@@ -277,27 +299,28 @@ namespace QCD {
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Single flavour four spinors with colour index, 5d redblack
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
 | 
			
		||||
class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
 | 
			
		||||
template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
 | 
			
		||||
class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > { 
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
 | 
			
		||||
  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
 | 
			
		||||
  INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
 | 
			
		||||
  static const int Dimension = Nrepresentation;
 | 
			
		||||
  static const int Dimension = Representation::Dimension;
 | 
			
		||||
  static const bool isFundamental = Representation::isFundamental;
 | 
			
		||||
  static const bool LsVectorised=true;
 | 
			
		||||
  static const int Nhcs = Options::Nhcs;
 | 
			
		||||
      
 | 
			
		||||
  typedef typename Options::_Coeff_t Coeff_t;      
 | 
			
		||||
  typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
 | 
			
		||||
  
 | 
			
		||||
  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
			
		||||
  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
 | 
			
		||||
  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
 | 
			
		||||
  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
 | 
			
		||||
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
 | 
			
		||||
  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
 | 
			
		||||
  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
 | 
			
		||||
  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
 | 
			
		||||
  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
 | 
			
		||||
  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
 | 
			
		||||
  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
 | 
			
		||||
  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
 | 
			
		||||
  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
 | 
			
		||||
  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
 | 
			
		||||
  
 | 
			
		||||
  typedef iImplSpinor<Simd>            SiteSpinor;
 | 
			
		||||
  typedef iImplPropagator<Simd>        SitePropagator;
 | 
			
		||||
@@ -333,14 +356,27 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
 | 
			
		||||
                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
 | 
			
		||||
                       StencilImpl &St) {
 | 
			
		||||
    SiteGaugeLink UU;
 | 
			
		||||
    for (int i = 0; i < Nrepresentation; i++) {
 | 
			
		||||
      for (int j = 0; j < Nrepresentation; j++) {
 | 
			
		||||
    for (int i = 0; i < Dimension; i++) {
 | 
			
		||||
      for (int j = 0; j < Dimension; j++) {
 | 
			
		||||
        vsplat(UU()()(i, j), U(mu)()(i, j));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&phi(), &UU(), &chi());
 | 
			
		||||
  }
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
  inline void multLinkProp(SitePropagator &phi,
 | 
			
		||||
                           const SiteDoubledGaugeField &U,
 | 
			
		||||
                           const SitePropagator &chi,
 | 
			
		||||
                           int mu) {
 | 
			
		||||
    SiteGaugeLink UU;
 | 
			
		||||
    for (int i = 0; i < Dimension; i++) {
 | 
			
		||||
      for (int j = 0; j < Dimension; j++) {
 | 
			
		||||
        vsplat(UU()()(i, j), U(mu)()(i, j));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    mult(&phi(), &UU(), &chi());
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
 | 
			
		||||
  {
 | 
			
		||||
    SiteScalarGaugeField  ScalarUmu;
 | 
			
		||||
@@ -373,6 +409,19 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  } 
 | 
			
		||||
 | 
			
		||||
  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) {
 | 
			
		||||
 | 
			
		||||
    assert(0);
 | 
			
		||||
@@ -425,25 +474,26 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Flavour doubled spinors; is Gparity the only? what about C*?
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <class S, int Nrepresentation, class Options=CoeffReal>
 | 
			
		||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
 | 
			
		||||
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
 | 
			
		||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
 | 
			
		||||
 public:
 | 
			
		||||
 | 
			
		||||
 static const int Dimension = Nrepresentation;
 | 
			
		||||
 static const int Dimension = Representation::Dimension;
 | 
			
		||||
 static const bool isFundamental = Representation::isFundamental;
 | 
			
		||||
 static const int Nhcs = Options::Nhcs;
 | 
			
		||||
 static const bool LsVectorised=false;
 | 
			
		||||
 | 
			
		||||
 typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
 | 
			
		||||
 typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
 | 
			
		||||
 INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
 | 
			
		||||
 typedef typename Options::_Coeff_t Coeff_t;
 | 
			
		||||
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
 | 
			
		||||
      
 | 
			
		||||
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>,   Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>,   Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>,  Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
 | 
			
		||||
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
 | 
			
		||||
 | 
			
		||||
 typedef iImplSpinor<Simd>            SiteSpinor;
 | 
			
		||||
 typedef iImplPropagator<Simd>        SitePropagator;
 | 
			
		||||
@@ -537,7 +587,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
   }
 | 
			
		||||
   
 | 
			
		||||
 }
 | 
			
		||||
 | 
			
		||||
    // Fixme: Gparity prop * link
 | 
			
		||||
    inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U,
 | 
			
		||||
                             const SitePropagator &chi, int mu)
 | 
			
		||||
    {
 | 
			
		||||
        assert(0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 template <class ref>
 | 
			
		||||
 inline void loadLinkElement(Simd ®, ref &memory) {
 | 
			
		||||
@@ -611,6 +666,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
   return;
 | 
			
		||||
 }
 | 
			
		||||
      
 | 
			
		||||
 inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
 | 
			
		||||
   //mat = outerProduct(Btilde, A);
 | 
			
		||||
   assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
 | 
			
		||||
    assert(0);
 | 
			
		||||
    /*
 | 
			
		||||
    auto tmp = TraceIndex<SpinIndex>(P);
 | 
			
		||||
    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
 | 
			
		||||
      mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
 | 
			
		||||
    }
 | 
			
		||||
    */
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
 inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) {
 | 
			
		||||
 | 
			
		||||
   int Ls = Btilde._grid->_fdimensions[0];
 | 
			
		||||
@@ -640,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
 | 
			
		||||
 | 
			
		||||
    typedef RealD  _Coeff_t ;
 | 
			
		||||
    static const int Dimension = Representation::Dimension;
 | 
			
		||||
    static const bool isFundamental = Representation::isFundamental;
 | 
			
		||||
    static const bool LsVectorised=false;
 | 
			
		||||
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
 | 
			
		||||
      
 | 
			
		||||
@@ -758,8 +833,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
 | 
			
		||||
      GaugeLinkField link(mat._grid);
 | 
			
		||||
      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
 | 
			
		||||
      PokeIndex<LorentzIndex>(mat,link,mu);
 | 
			
		||||
    }   
 | 
			
		||||
      
 | 
			
		||||
    } 
 | 
			
		||||
          
 | 
			
		||||
    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
 | 
			
		||||
      assert (0); 
 | 
			
		||||
      // Must never hit
 | 
			
		||||
@@ -775,6 +850,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
    static const int Dimension = Representation::Dimension;
 | 
			
		||||
    static const bool isFundamental = Representation::isFundamental;
 | 
			
		||||
    static const bool LsVectorised=true;
 | 
			
		||||
    typedef RealD   Coeff_t ;
 | 
			
		||||
    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
 | 
			
		||||
@@ -951,29 +1027,33 @@ typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation, CoeffReal > Wilso
 | 
			
		||||
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF;  // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD;  // Double
 | 
			
		||||
 
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
 | 
			
		||||
typedef WilsonImpl<vComplex,  TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR;   // Real.. whichever prec
 | 
			
		||||
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF;  // Float
 | 
			
		||||
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD;  // Double
 | 
			
		||||
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
 | 
			
		||||
 
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
 | 
			
		||||
 
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
 | 
			
		||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
 | 
			
		||||
 
 | 
			
		||||
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
 | 
			
		||||
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF;  // Float
 | 
			
		||||
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD;  // Double
 | 
			
		||||
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR;  // Real.. whichever prec
 | 
			
		||||
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF;  // Float
 | 
			
		||||
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD;  // Double
 | 
			
		||||
 
 | 
			
		||||
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
 | 
			
		||||
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
 | 
			
		||||
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
 | 
			
		||||
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL;  // Real.. whichever prec
 | 
			
		||||
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH;  // Float
 | 
			
		||||
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF;  // Double
 | 
			
		||||
 | 
			
		||||
typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec
 | 
			
		||||
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float
 | 
			
		||||
 
 | 
			
		||||
@@ -569,6 +569,31 @@ void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////// 
 | 
			
		||||
// Conserved current - not yet implemented.
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                                        PropagatorField &q_in_2,
 | 
			
		||||
                                                        PropagatorField &q_out,
 | 
			
		||||
                                                        Current curr_type,
 | 
			
		||||
                                                        unsigned int mu)
 | 
			
		||||
{
 | 
			
		||||
    assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                                                         PropagatorField &q_out,
 | 
			
		||||
                                                         Current curr_type,
 | 
			
		||||
                                                         unsigned int mu, 
 | 
			
		||||
                                                         std::vector<Real> mom,
 | 
			
		||||
                                                         unsigned int tmin,
 | 
			
		||||
                                                         unsigned int tmax)
 | 
			
		||||
{
 | 
			
		||||
    assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
 | 
			
		||||
 | 
			
		||||
  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
 | 
			
		||||
 
 | 
			
		||||
@@ -179,6 +179,22 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
 | 
			
		||||
 | 
			
		||||
  LebesgueOrder Lebesgue;
 | 
			
		||||
  LebesgueOrder LebesgueEvenOdd;
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Conserved current utilities
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  void ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                PropagatorField &q_in_2,
 | 
			
		||||
                                PropagatorField &q_out,
 | 
			
		||||
                                Current curr_type,
 | 
			
		||||
                                unsigned int mu);
 | 
			
		||||
  void SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                           PropagatorField &q_out,
 | 
			
		||||
                           Current curr_type,
 | 
			
		||||
                           unsigned int mu, 
 | 
			
		||||
                           std::vector<Real> mom,
 | 
			
		||||
                           unsigned int tmin,
 | 
			
		||||
                           unsigned int tmax);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 | 
			
		||||
 
 | 
			
		||||
@@ -619,6 +619,30 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
 | 
			
		||||
  MooeeInv(in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////// 
 | 
			
		||||
// Conserved current - not yet implemented.
 | 
			
		||||
////////////////////////////////////////////////////////
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                                         PropagatorField &q_in_2,
 | 
			
		||||
                                                         PropagatorField &q_out,
 | 
			
		||||
                                                         Current curr_type,
 | 
			
		||||
                                                         unsigned int mu)
 | 
			
		||||
{
 | 
			
		||||
    assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                                                          PropagatorField &q_out,
 | 
			
		||||
                                                          Current curr_type,
 | 
			
		||||
                                                          unsigned int mu, 
 | 
			
		||||
                                                          std::vector<Real> mom,
 | 
			
		||||
                                                          unsigned int tmin,
 | 
			
		||||
                                                          unsigned int tmax)
 | 
			
		||||
{
 | 
			
		||||
    assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
 | 
			
		||||
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
 | 
			
		||||
 
 | 
			
		||||
@@ -212,6 +212,21 @@ namespace QCD {
 | 
			
		||||
    // Comms buffer
 | 
			
		||||
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 | 
			
		||||
    
 | 
			
		||||
    ///////////////////////////////////////////////////////////////
 | 
			
		||||
    // Conserved current utilities
 | 
			
		||||
    ///////////////////////////////////////////////////////////////
 | 
			
		||||
    void ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                  PropagatorField &q_in_2,
 | 
			
		||||
                                  PropagatorField &q_out,
 | 
			
		||||
                                  Current curr_type,
 | 
			
		||||
                                  unsigned int mu);
 | 
			
		||||
    void SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                             PropagatorField &q_out,
 | 
			
		||||
                             Current curr_type,
 | 
			
		||||
                             unsigned int mu, 
 | 
			
		||||
                             std::vector<Real> mom,
 | 
			
		||||
                             unsigned int tmin,
 | 
			
		||||
                             unsigned int tmax);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -853,7 +853,7 @@ namespace QCD {
 | 
			
		||||
 | 
			
		||||
              a0 = a0 + incr;
 | 
			
		||||
              a1 = a1 + incr;
 | 
			
		||||
              a2 = a2 + sizeof(Simd::scalar_type);
 | 
			
		||||
              a2 = a2 + sizeof(typename Simd::scalar_type);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -396,6 +396,27 @@ namespace Grid {
 | 
			
		||||
      amax=zolo_hi;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      conformable(solution5d._grid,this->FermionGrid());
 | 
			
		||||
      conformable(exported4d._grid,this->GaugeGrid());
 | 
			
		||||
      ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
 | 
			
		||||
    }
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      conformable(imported5d._grid,this->FermionGrid());
 | 
			
		||||
      conformable(input4d._grid   ,this->GaugeGrid());
 | 
			
		||||
      FermionField tmp(this->FermionGrid());
 | 
			
		||||
      tmp=zero;
 | 
			
		||||
      InsertSlice(input4d, tmp, Ls-1, Ls-1);
 | 
			
		||||
      tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
 | 
			
		||||
      this->Dminus(tmp,imported5d);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
 | 
			
		||||
 
 | 
			
		||||
@@ -70,6 +70,12 @@ namespace Grid {
 | 
			
		||||
      // Efficient support for multigrid coarsening
 | 
			
		||||
      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      // Physical surface field utilities
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
 | 
			
		||||
      virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      PartialFractionFermion5D(GaugeField &_Umu,
 | 
			
		||||
			       GridCartesian         &FiveDimGrid,
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										243
									
								
								lib/qcd/action/fermion/WilsonCloverFermion.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										243
									
								
								lib/qcd/action/fermion/WilsonCloverFermion.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,243 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
    Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#include <Grid/Eigen/Dense>
 | 
			
		||||
#include <Grid/qcd/spin/Dirac.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid
 | 
			
		||||
{
 | 
			
		||||
namespace QCD
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
// *NOT* EO
 | 
			
		||||
template <class Impl>
 | 
			
		||||
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  FermionField temp(out._grid);
 | 
			
		||||
 | 
			
		||||
  // Wilson term
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  this->Dhop(in, out, DaggerNo);
 | 
			
		||||
 | 
			
		||||
  // Clover term
 | 
			
		||||
  Mooee(in, temp);
 | 
			
		||||
 | 
			
		||||
  out += temp;
 | 
			
		||||
  return norm2(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  FermionField temp(out._grid);
 | 
			
		||||
 | 
			
		||||
  // Wilson term
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  this->Dhop(in, out, DaggerYes);
 | 
			
		||||
 | 
			
		||||
  // Clover term
 | 
			
		||||
  MooeeDag(in, temp);
 | 
			
		||||
 | 
			
		||||
  out += temp;
 | 
			
		||||
  return norm2(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
			
		||||
{
 | 
			
		||||
  WilsonFermion<Impl>::ImportGauge(_Umu);
 | 
			
		||||
  GridBase *grid = _Umu._grid;
 | 
			
		||||
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
 | 
			
		||||
 | 
			
		||||
  // Compute the field strength terms mu>nu
 | 
			
		||||
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
 | 
			
		||||
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
 | 
			
		||||
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
 | 
			
		||||
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
 | 
			
		||||
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
 | 
			
		||||
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
 | 
			
		||||
 | 
			
		||||
  // Compute the Clover Operator acting on Colour and Spin
 | 
			
		||||
  // multiply here by the clover coefficients for the anisotropy
 | 
			
		||||
  CloverTerm  = fillCloverYZ(Bx) * csw_r;
 | 
			
		||||
  CloverTerm += fillCloverXZ(By) * csw_r;
 | 
			
		||||
  CloverTerm += fillCloverXY(Bz) * csw_r;
 | 
			
		||||
  CloverTerm += fillCloverXT(Ex) * csw_t;
 | 
			
		||||
  CloverTerm += fillCloverYT(Ey) * csw_t;
 | 
			
		||||
  CloverTerm += fillCloverZT(Ez) * csw_t;
 | 
			
		||||
  CloverTerm += diag_mass;
 | 
			
		||||
 | 
			
		||||
  int lvol = _Umu._grid->lSites();
 | 
			
		||||
  int DimRep = Impl::Dimension;
 | 
			
		||||
 | 
			
		||||
  Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
 | 
			
		||||
  Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> lcoor;
 | 
			
		||||
  typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
 | 
			
		||||
 | 
			
		||||
  for (int site = 0; site < lvol; site++)
 | 
			
		||||
  {
 | 
			
		||||
    grid->LocalIndexToLocalCoor(site, lcoor);
 | 
			
		||||
    EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
 | 
			
		||||
    peekLocalSite(Qx, CloverTerm, lcoor);
 | 
			
		||||
    Qxinv = zero;
 | 
			
		||||
    //if (csw!=0){
 | 
			
		||||
    for (int j = 0; j < Ns; j++)
 | 
			
		||||
      for (int k = 0; k < Ns; k++)
 | 
			
		||||
        for (int a = 0; a < DimRep; a++)
 | 
			
		||||
          for (int b = 0; b < DimRep; b++)
 | 
			
		||||
            EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
 | 
			
		||||
    //   if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
 | 
			
		||||
 | 
			
		||||
    EigenInvCloverOp = EigenCloverOp.inverse();
 | 
			
		||||
    //std::cout << EigenInvCloverOp << std::endl;
 | 
			
		||||
    for (int j = 0; j < Ns; j++)
 | 
			
		||||
      for (int k = 0; k < Ns; k++)
 | 
			
		||||
        for (int a = 0; a < DimRep; a++)
 | 
			
		||||
          for (int b = 0; b < DimRep; b++)
 | 
			
		||||
            Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
 | 
			
		||||
    //    if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
 | 
			
		||||
    //  }
 | 
			
		||||
    pokeLocalSite(Qxinv, CloverTermInv, lcoor);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Separate the even and odd parts
 | 
			
		||||
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
 | 
			
		||||
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
 | 
			
		||||
  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
 | 
			
		||||
  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
 | 
			
		||||
  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  this->MooeeInternal(in, out, DaggerNo, InverseNo);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  this->MooeeInternal(in, out, DaggerYes, InverseNo);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  this->MooeeInternal(in, out, DaggerNo, InverseYes);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  this->MooeeInternal(in, out, DaggerYes, InverseYes);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
 | 
			
		||||
{
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  CloverFieldType *Clover;
 | 
			
		||||
  assert(in.checkerboard == Odd || in.checkerboard == Even);
 | 
			
		||||
 | 
			
		||||
  if (dag)
 | 
			
		||||
  {
 | 
			
		||||
    if (in._grid->_isCheckerBoarded)
 | 
			
		||||
    {
 | 
			
		||||
      if (in.checkerboard == Odd)
 | 
			
		||||
      {
 | 
			
		||||
        Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
 | 
			
		||||
      }
 | 
			
		||||
      else
 | 
			
		||||
      {
 | 
			
		||||
        Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
 | 
			
		||||
      }
 | 
			
		||||
      out = *Clover * in;
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
 | 
			
		||||
      out = adj(*Clover) * in;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  else
 | 
			
		||||
  {
 | 
			
		||||
    if (in._grid->_isCheckerBoarded)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
      if (in.checkerboard == Odd)
 | 
			
		||||
      {
 | 
			
		||||
        //  std::cout << "Calling clover term Odd" << std::endl;
 | 
			
		||||
        Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
 | 
			
		||||
      }
 | 
			
		||||
      else
 | 
			
		||||
      {
 | 
			
		||||
        //  std::cout << "Calling clover term Even" << std::endl;
 | 
			
		||||
        Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
 | 
			
		||||
      }
 | 
			
		||||
      out = *Clover * in;
 | 
			
		||||
      //  std::cout << GridLogMessage << "*Clover.checkerboard "  << (*Clover).checkerboard << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
      Clover = (inv) ? &CloverTermInv : &CloverTerm;
 | 
			
		||||
      out = *Clover * in;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
} // MooeeInternal
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Derivative parts
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Derivative parts
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert(0); // not implemented yet
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpTemplateInstantiate(WilsonCloverFermion);
 | 
			
		||||
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
 | 
			
		||||
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
 | 
			
		||||
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										366
									
								
								lib/qcd/action/fermion/WilsonCloverFermion.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										366
									
								
								lib/qcd/action/fermion/WilsonCloverFermion.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,366 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
 | 
			
		||||
    Author: David Preti <>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
 | 
			
		||||
#define GRID_QCD_WILSON_CLOVER_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid
 | 
			
		||||
{
 | 
			
		||||
namespace QCD
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Wilson Clover
 | 
			
		||||
//
 | 
			
		||||
// Operator ( with anisotropy coefficients):
 | 
			
		||||
//
 | 
			
		||||
// Q =   1 + (Nd-1)/xi_0 + m
 | 
			
		||||
//     + W_t + (nu/xi_0) * W_s
 | 
			
		||||
//     - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss)  ]
 | 
			
		||||
//
 | 
			
		||||
// s spatial, t temporal directions.
 | 
			
		||||
// where W_t and W_s are the temporal and spatial components of the
 | 
			
		||||
// Wilson Dirac operator
 | 
			
		||||
//
 | 
			
		||||
// csw_r = csw_t to recover the isotropic version
 | 
			
		||||
//////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
class WilsonCloverFermion : public WilsonFermion<Impl>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  // Types definitions
 | 
			
		||||
  INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
  template <typename vtype>
 | 
			
		||||
  using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
 | 
			
		||||
  typedef iImplClover<Simd> SiteCloverType;
 | 
			
		||||
  typedef Lattice<SiteCloverType> CloverFieldType;
 | 
			
		||||
 | 
			
		||||
public:
 | 
			
		||||
  typedef WilsonFermion<Impl> WilsonBase;
 | 
			
		||||
 | 
			
		||||
  virtual void Instantiatable(void){};
 | 
			
		||||
  // Constructors
 | 
			
		||||
  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 | 
			
		||||
                      GridRedBlackCartesian &Hgrid,
 | 
			
		||||
                      const RealD _mass,
 | 
			
		||||
                      const RealD _csw_r = 0.0,
 | 
			
		||||
                      const RealD _csw_t = 0.0,
 | 
			
		||||
                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
 | 
			
		||||
                      const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
 | 
			
		||||
                                                                                     Fgrid,
 | 
			
		||||
                                                                                     Hgrid,
 | 
			
		||||
                                                                                     _mass, impl_p, clover_anisotropy),
 | 
			
		||||
                                                                 CloverTerm(&Fgrid),
 | 
			
		||||
                                                                 CloverTermInv(&Fgrid),
 | 
			
		||||
                                                                 CloverTermEven(&Hgrid),
 | 
			
		||||
                                                                 CloverTermOdd(&Hgrid),
 | 
			
		||||
                                                                 CloverTermInvEven(&Hgrid),
 | 
			
		||||
                                                                 CloverTermInvOdd(&Hgrid),
 | 
			
		||||
                                                                 CloverTermDagEven(&Hgrid),
 | 
			
		||||
                                                                 CloverTermDagOdd(&Hgrid),
 | 
			
		||||
                                                                 CloverTermInvDagEven(&Hgrid),
 | 
			
		||||
                                                                 CloverTermInvDagOdd(&Hgrid)
 | 
			
		||||
  {
 | 
			
		||||
    assert(Nd == 4); // require 4 dimensions
 | 
			
		||||
 | 
			
		||||
    if (clover_anisotropy.isAnisotropic)
 | 
			
		||||
    {
 | 
			
		||||
      csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
 | 
			
		||||
      diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
      csw_r = _csw_r * 0.5;
 | 
			
		||||
      diag_mass = 4.0 + _mass;
 | 
			
		||||
    }
 | 
			
		||||
    csw_t = _csw_t * 0.5;
 | 
			
		||||
 | 
			
		||||
    if (csw_r == 0)
 | 
			
		||||
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
 | 
			
		||||
    if (csw_t == 0)
 | 
			
		||||
      std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
 | 
			
		||||
 | 
			
		||||
    ImportGauge(_Umu);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  virtual RealD M(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual RealD Mdag(const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  virtual void Mooee(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual void MooeeDag(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual void MooeeInv(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual void MooeeInvDag(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
  //virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
 | 
			
		||||
  virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
 | 
			
		||||
  virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
 | 
			
		||||
 | 
			
		||||
  void ImportGauge(const GaugeField &_Umu);
 | 
			
		||||
 | 
			
		||||
  // Derivative parts unpreconditioned pseudofermions
 | 
			
		||||
  void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
 | 
			
		||||
  {
 | 
			
		||||
    conformable(X._grid, Y._grid);
 | 
			
		||||
    conformable(X._grid, force._grid);
 | 
			
		||||
    GaugeLinkField force_mu(force._grid), lambda(force._grid);
 | 
			
		||||
    GaugeField clover_force(force._grid);
 | 
			
		||||
    PropagatorField Lambda(force._grid);
 | 
			
		||||
 | 
			
		||||
    // Guido: Here we are hitting some performance issues:
 | 
			
		||||
    // need to extract the components of the DoubledGaugeField
 | 
			
		||||
    // for each call
 | 
			
		||||
    // Possible solution
 | 
			
		||||
    // Create a vector object to store them? (cons: wasting space)
 | 
			
		||||
    std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
 | 
			
		||||
 | 
			
		||||
    Impl::extractLinkField(U, this->Umu);
 | 
			
		||||
 | 
			
		||||
    force = zero;
 | 
			
		||||
    // Derivative of the Wilson hopping term
 | 
			
		||||
    this->DhopDeriv(force, X, Y, dag);
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    // Clover term derivative
 | 
			
		||||
    ///////////////////////////////////////////////////////////
 | 
			
		||||
    Impl::outerProductImpl(Lambda, X, Y);
 | 
			
		||||
    //std::cout << "Lambda:" << Lambda << std::endl;
 | 
			
		||||
 | 
			
		||||
    Gamma::Algebra sigma[] = {
 | 
			
		||||
        Gamma::Algebra::SigmaXY,
 | 
			
		||||
        Gamma::Algebra::SigmaXZ,
 | 
			
		||||
        Gamma::Algebra::SigmaXT,
 | 
			
		||||
        Gamma::Algebra::MinusSigmaXY,
 | 
			
		||||
        Gamma::Algebra::SigmaYZ,
 | 
			
		||||
        Gamma::Algebra::SigmaYT,
 | 
			
		||||
        Gamma::Algebra::MinusSigmaXZ,
 | 
			
		||||
        Gamma::Algebra::MinusSigmaYZ,
 | 
			
		||||
        Gamma::Algebra::SigmaZT,
 | 
			
		||||
        Gamma::Algebra::MinusSigmaXT,
 | 
			
		||||
        Gamma::Algebra::MinusSigmaYT,
 | 
			
		||||
        Gamma::Algebra::MinusSigmaZT};
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
      sigma_{\mu \nu}=
 | 
			
		||||
      | 0         sigma[0]  sigma[1]  sigma[2] |
 | 
			
		||||
      | sigma[3]    0       sigma[4]  sigma[5] |
 | 
			
		||||
      | sigma[6]  sigma[7]     0      sigma[8] |
 | 
			
		||||
      | sigma[9]  sigma[10] sigma[11]   0      |
 | 
			
		||||
    */
 | 
			
		||||
 | 
			
		||||
    int count = 0;
 | 
			
		||||
    clover_force = zero;
 | 
			
		||||
    for (int mu = 0; mu < 4; mu++)
 | 
			
		||||
    {
 | 
			
		||||
      force_mu = zero;
 | 
			
		||||
      for (int nu = 0; nu < 4; nu++)
 | 
			
		||||
      {
 | 
			
		||||
        if (mu == nu)
 | 
			
		||||
        continue;
 | 
			
		||||
        
 | 
			
		||||
        RealD factor;
 | 
			
		||||
        if (nu == 4 || mu == 4)
 | 
			
		||||
        {
 | 
			
		||||
          factor = 2.0 * csw_t;
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          factor = 2.0 * csw_r;
 | 
			
		||||
        }
 | 
			
		||||
        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
 | 
			
		||||
        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
 | 
			
		||||
        force_mu -= factor*Cmunu(U, lambda, mu, nu);                   // checked
 | 
			
		||||
        count++;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      pokeLorentz(clover_force, U[mu] * force_mu, mu);
 | 
			
		||||
    }
 | 
			
		||||
    //clover_force *= csw;
 | 
			
		||||
    force += clover_force;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
 | 
			
		||||
  GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
 | 
			
		||||
  {
 | 
			
		||||
    conformable(lambda._grid, U[0]._grid);
 | 
			
		||||
    GaugeLinkField out(lambda._grid), tmp(lambda._grid);
 | 
			
		||||
    // insertion in upper staple
 | 
			
		||||
    // please check redundancy of shift operations
 | 
			
		||||
 | 
			
		||||
    // C1+
 | 
			
		||||
    tmp = lambda * U[nu];
 | 
			
		||||
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
 | 
			
		||||
 | 
			
		||||
    // C2+
 | 
			
		||||
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
 | 
			
		||||
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
 | 
			
		||||
 | 
			
		||||
    // C3+
 | 
			
		||||
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
 | 
			
		||||
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
 | 
			
		||||
 | 
			
		||||
    // C4+
 | 
			
		||||
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
 | 
			
		||||
 | 
			
		||||
    // insertion in lower staple
 | 
			
		||||
    // C1-
 | 
			
		||||
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
 | 
			
		||||
 | 
			
		||||
    // C2-
 | 
			
		||||
    tmp = adj(lambda) * U[nu];
 | 
			
		||||
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
 | 
			
		||||
 | 
			
		||||
    // C3-
 | 
			
		||||
    tmp = lambda * U[nu];
 | 
			
		||||
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
 | 
			
		||||
 | 
			
		||||
    // C4-
 | 
			
		||||
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
 | 
			
		||||
 | 
			
		||||
    return out;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
  // here fixing the 4 dimensions, make it more general?
 | 
			
		||||
 | 
			
		||||
  RealD csw_r;                                               // Clover coefficient - spatial
 | 
			
		||||
  RealD csw_t;                                               // Clover coefficient - temporal
 | 
			
		||||
  RealD diag_mass;                                           // Mass term
 | 
			
		||||
  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
 | 
			
		||||
  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
 | 
			
		||||
  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
 | 
			
		||||
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
 | 
			
		||||
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
 | 
			
		||||
 | 
			
		||||
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
 | 
			
		||||
  // using the DeGrand-Rossi basis for the gamma matrices
 | 
			
		||||
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F._grid);
 | 
			
		||||
    T = zero;
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
 | 
			
		||||
    {
 | 
			
		||||
      T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return T;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F._grid);
 | 
			
		||||
    T = zero;
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
 | 
			
		||||
    {
 | 
			
		||||
      T._odata[i]()(0, 1) = -F._odata[i]()();
 | 
			
		||||
      T._odata[i]()(1, 0) = F._odata[i]()();
 | 
			
		||||
      T._odata[i]()(2, 3) = -F._odata[i]()();
 | 
			
		||||
      T._odata[i]()(3, 2) = F._odata[i]()();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return T;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CloverFieldType fillCloverXY(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F._grid);
 | 
			
		||||
    T = zero;
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
      T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(1, 1) = timesI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return T;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CloverFieldType fillCloverXT(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F._grid);
 | 
			
		||||
    T = zero;
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
 | 
			
		||||
    {
 | 
			
		||||
      T._odata[i]()(0, 1) = timesI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(1, 0) = timesI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return T;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CloverFieldType fillCloverYT(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F._grid);
 | 
			
		||||
    T = zero;
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
 | 
			
		||||
    {
 | 
			
		||||
      T._odata[i]()(0, 1) = -(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(1, 0) = (F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(2, 3) = (F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(3, 2) = -(F._odata[i]()());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return T;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CloverFieldType fillCloverZT(const GaugeLinkField &F)
 | 
			
		||||
  {
 | 
			
		||||
    CloverFieldType T(F._grid);
 | 
			
		||||
    T = zero;
 | 
			
		||||
    PARALLEL_FOR_LOOP
 | 
			
		||||
    for (int i = 0; i < CloverTerm._grid->oSites(); i++)
 | 
			
		||||
    {
 | 
			
		||||
      T._odata[i]()(0, 0) = timesI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
 | 
			
		||||
      T._odata[i]()(3, 3) = timesI(F._odata[i]()());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return T;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
 | 
			
		||||
@@ -69,39 +69,47 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Compress includes precision change if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
 | 
			
		||||
    projector::Proj(buf[o],in,mu,dag);
 | 
			
		||||
  inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
 | 
			
		||||
    SiteHalfSpinor tmp;
 | 
			
		||||
    projector::Proj(tmp,in,mu,dag);
 | 
			
		||||
    vstream(buf[o],tmp);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Exchange includes precision change if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Exchange(SiteHalfSpinor *mp,
 | 
			
		||||
                       SiteHalfSpinor *vp0,
 | 
			
		||||
                       SiteHalfSpinor *vp1,
 | 
			
		||||
  inline void Exchange(SiteHalfSpinor * __restrict__ mp,
 | 
			
		||||
                       const SiteHalfSpinor * __restrict__ vp0,
 | 
			
		||||
                       const SiteHalfSpinor * __restrict__ vp1,
 | 
			
		||||
		       Integer type,Integer o){
 | 
			
		||||
    exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
 | 
			
		||||
    SiteHalfSpinor tmp1;
 | 
			
		||||
    SiteHalfSpinor tmp2;
 | 
			
		||||
    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
 | 
			
		||||
    vstream(mp[2*o  ],tmp1);
 | 
			
		||||
    vstream(mp[2*o+1],tmp2);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Have a decompression step if mpi data is not same */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void Decompress(SiteHalfSpinor *out,
 | 
			
		||||
			 SiteHalfSpinor *in, Integer o) {    
 | 
			
		||||
  inline void Decompress(SiteHalfSpinor * __restrict__ out,
 | 
			
		||||
			 SiteHalfSpinor * __restrict__ in, Integer o) {    
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  /* Compress Exchange                                 */
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
  inline void CompressExchange(SiteHalfSpinor *out0,
 | 
			
		||||
			       SiteHalfSpinor *out1,
 | 
			
		||||
			       const SiteSpinor *in,
 | 
			
		||||
  inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
 | 
			
		||||
			       SiteHalfSpinor * __restrict__ out1,
 | 
			
		||||
			       const SiteSpinor * __restrict__ in,
 | 
			
		||||
			       Integer j,Integer k, Integer m,Integer type){
 | 
			
		||||
    SiteHalfSpinor temp1, temp2,temp3,temp4;
 | 
			
		||||
    projector::Proj(temp1,in[k],mu,dag);
 | 
			
		||||
    projector::Proj(temp2,in[m],mu,dag);
 | 
			
		||||
    exchange(out0[j],out1[j],temp1,temp2,type);
 | 
			
		||||
    exchange(temp3,temp4,temp1,temp2,type);
 | 
			
		||||
    vstream(out0[j],temp3);
 | 
			
		||||
    vstream(out1[j],temp4);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /*****************************************************/
 | 
			
		||||
@@ -265,7 +273,6 @@ public:
 | 
			
		||||
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
 | 
			
		||||
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  WilsonStencil(GridBase *grid,
 | 
			
		||||
		int npoints,
 | 
			
		||||
 
 | 
			
		||||
@@ -47,7 +47,8 @@ int WilsonFermionStatic::HandOptDslash;
 | 
			
		||||
template <class Impl>
 | 
			
		||||
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 | 
			
		||||
                                   GridRedBlackCartesian &Hgrid, RealD _mass,
 | 
			
		||||
                                   const ImplParams &p)
 | 
			
		||||
                                   const ImplParams &p,
 | 
			
		||||
                                   const WilsonAnisotropyCoefficients &anis)
 | 
			
		||||
    : Kernels(p),
 | 
			
		||||
      _grid(&Fgrid),
 | 
			
		||||
      _cbgrid(&Hgrid),
 | 
			
		||||
@@ -60,16 +61,41 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 | 
			
		||||
      Umu(&Fgrid),
 | 
			
		||||
      UmuEven(&Hgrid),
 | 
			
		||||
      UmuOdd(&Hgrid),
 | 
			
		||||
      _tmp(&Hgrid)
 | 
			
		||||
      _tmp(&Hgrid),
 | 
			
		||||
      anisotropyCoeff(anis)
 | 
			
		||||
{
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  ImportGauge(_Umu);
 | 
			
		||||
  if  (anisotropyCoeff.isAnisotropic){
 | 
			
		||||
    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
 | 
			
		||||
  } else {
 | 
			
		||||
    diag_mass = 4.0 + mass;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
 | 
			
		||||
  GaugeField HUmu(_Umu._grid);
 | 
			
		||||
  HUmu = _Umu * (-0.5);
 | 
			
		||||
 | 
			
		||||
  //Here multiply the anisotropy coefficients
 | 
			
		||||
  if (anisotropyCoeff.isAnisotropic)
 | 
			
		||||
  {
 | 
			
		||||
 | 
			
		||||
    for (int mu = 0; mu < Nd; mu++)
 | 
			
		||||
    {
 | 
			
		||||
      GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
 | 
			
		||||
      if (mu != anisotropyCoeff.t_direction)
 | 
			
		||||
        U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
 | 
			
		||||
 | 
			
		||||
      PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  else
 | 
			
		||||
  {
 | 
			
		||||
    HUmu = _Umu * (-0.5);
 | 
			
		||||
  }
 | 
			
		||||
  Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
 | 
			
		||||
  pickCheckerboard(Even, UmuEven, Umu);
 | 
			
		||||
  pickCheckerboard(Odd, UmuOdd, Umu);
 | 
			
		||||
@@ -83,14 +109,14 @@ template <class Impl>
 | 
			
		||||
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  Dhop(in, out, DaggerNo);
 | 
			
		||||
  return axpy_norm(out, 4 + mass, in, out);
 | 
			
		||||
  return axpy_norm(out, diag_mass, in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  Dhop(in, out, DaggerYes);
 | 
			
		||||
  return axpy_norm(out, 4 + mass, in, out);
 | 
			
		||||
  return axpy_norm(out, diag_mass, in, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
@@ -114,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  typename FermionField::scalar_type scal(4.0 + mass);
 | 
			
		||||
  typename FermionField::scalar_type scal(diag_mass);
 | 
			
		||||
  out = scal * in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -127,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  out = (1.0/(diag_mass))*in;
 | 
			
		||||
}
 | 
			
		||||
  
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -204,7 +230,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 | 
			
		||||
 | 
			
		||||
  FermionField Btilde(B._grid);
 | 
			
		||||
  FermionField Atilde(B._grid);
 | 
			
		||||
  Atilde = A;
 | 
			
		||||
  Atilde = A;//redundant
 | 
			
		||||
 | 
			
		||||
  st.HaloExchange(B, compressor);
 | 
			
		||||
 | 
			
		||||
@@ -429,6 +455,112 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
 | 
			
		||||
};
 | 
			
		||||
/*Change ends */
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Conserved current utilities for Wilson fermions, for contracting propagators
 | 
			
		||||
 * to make a conserved current sink or inserting the conserved current 
 | 
			
		||||
 * sequentially.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                                   PropagatorField &q_in_2,
 | 
			
		||||
                                                   PropagatorField &q_out,
 | 
			
		||||
                                                   Current curr_type,
 | 
			
		||||
                                                   unsigned int mu)
 | 
			
		||||
{
 | 
			
		||||
    Gamma g5(Gamma::Algebra::Gamma5);
 | 
			
		||||
    conformable(_grid, q_in_1._grid);
 | 
			
		||||
    conformable(_grid, q_in_2._grid);
 | 
			
		||||
    conformable(_grid, q_out._grid);
 | 
			
		||||
    PropagatorField tmp1(_grid), tmp2(_grid);
 | 
			
		||||
    q_out = zero;
 | 
			
		||||
 | 
			
		||||
    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
 | 
			
		||||
    // Inefficient comms method but not performance critical.
 | 
			
		||||
    tmp1 = Cshift(q_in_1, mu, 1);
 | 
			
		||||
    tmp2 = Cshift(q_in_2, mu, 1);
 | 
			
		||||
    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
 | 
			
		||||
    {
 | 
			
		||||
        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
 | 
			
		||||
                                                 q_in_2._odata[sU],
 | 
			
		||||
                                                 q_out._odata[sU],
 | 
			
		||||
                                                 Umu, sU, mu);
 | 
			
		||||
        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
 | 
			
		||||
                                                 tmp2._odata[sU],
 | 
			
		||||
                                                 q_out._odata[sU],
 | 
			
		||||
                                                 Umu, sU, mu);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
 | 
			
		||||
                                              PropagatorField &q_out,
 | 
			
		||||
                                              Current curr_type,
 | 
			
		||||
                                              unsigned int mu,
 | 
			
		||||
                                              std::vector<Real> mom,
 | 
			
		||||
                                              unsigned int tmin, 
 | 
			
		||||
                                              unsigned int tmax)
 | 
			
		||||
{
 | 
			
		||||
    conformable(_grid, q_in._grid);
 | 
			
		||||
    conformable(_grid, q_out._grid);
 | 
			
		||||
    Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
 | 
			
		||||
    ComplexD i(0.0,1.0);
 | 
			
		||||
    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
 | 
			
		||||
    unsigned int tshift = (mu == Tp) ? 1 : 0;
 | 
			
		||||
    unsigned int LLt    = GridDefaultLatt()[Tp];
 | 
			
		||||
 | 
			
		||||
    // Momentum projection
 | 
			
		||||
    ph = zero;
 | 
			
		||||
    for(unsigned int mu = 0; mu < Nd - 1; mu++)
 | 
			
		||||
    {
 | 
			
		||||
        LatticeCoordinate(coor, mu);
 | 
			
		||||
        ph = ph + mom[mu]*coor*((1./(_grid->_fdimensions[mu])));
 | 
			
		||||
    }
 | 
			
		||||
    ph = exp((RealD)(2*M_PI)*i*ph);
 | 
			
		||||
 | 
			
		||||
    q_out = zero;
 | 
			
		||||
    LatticeInteger coords(_grid);
 | 
			
		||||
    LatticeCoordinate(coords, Tp);
 | 
			
		||||
 | 
			
		||||
    // Need q(x + mu) and q(x - mu).
 | 
			
		||||
    tmp = Cshift(q_in, mu, 1);
 | 
			
		||||
    tmpFwd = tmp*ph;
 | 
			
		||||
    tmp = ph*q_in;
 | 
			
		||||
    tmpBwd = Cshift(tmp, mu, -1);
 | 
			
		||||
 | 
			
		||||
    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
 | 
			
		||||
    {
 | 
			
		||||
        // Compute the sequential conserved current insertion only if our simd
 | 
			
		||||
        // object contains a timeslice we need.
 | 
			
		||||
        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
 | 
			
		||||
                             (coords._odata[sU] <= tmax));
 | 
			
		||||
        Integer timeSlices = Reduce(t_mask);
 | 
			
		||||
 | 
			
		||||
        if (timeSlices > 0)
 | 
			
		||||
        {
 | 
			
		||||
            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU], 
 | 
			
		||||
                                                q_out._odata[sU], 
 | 
			
		||||
                                                Umu, sU, mu, t_mask);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Repeat for backward direction.
 | 
			
		||||
        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
 | 
			
		||||
                      (coords._odata[sU] <= (tmax + tshift)));
 | 
			
		||||
 | 
			
		||||
	//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
 | 
			
		||||
	unsigned int t0 = 0;
 | 
			
		||||
	if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
 | 
			
		||||
 | 
			
		||||
        timeSlices = Reduce(t_mask);
 | 
			
		||||
 | 
			
		||||
        if (timeSlices > 0)
 | 
			
		||||
        {
 | 
			
		||||
            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU], 
 | 
			
		||||
                                                q_out._odata[sU], 
 | 
			
		||||
                                                Umu, sU, mu, t_mask);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
AdjointFermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
 
 | 
			
		||||
@@ -44,6 +44,21 @@ class WilsonFermionStatic {
 | 
			
		||||
  static const int npoint = 8;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 struct WilsonAnisotropyCoefficients: Serializable
 | 
			
		||||
 {
 | 
			
		||||
  GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
 | 
			
		||||
  bool, isAnisotropic,
 | 
			
		||||
  int, t_direction,
 | 
			
		||||
  double, xi_0,
 | 
			
		||||
  double, nu);
 | 
			
		||||
 | 
			
		||||
  WilsonAnisotropyCoefficients():
 | 
			
		||||
    isAnisotropic(false), 
 | 
			
		||||
    t_direction(Nd-1), 
 | 
			
		||||
    xi_0(1.0), 
 | 
			
		||||
    nu(1.0){}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 | 
			
		||||
 public:
 | 
			
		||||
@@ -65,8 +80,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 | 
			
		||||
  // override multiply; cut number routines if pass dagger argument
 | 
			
		||||
  // and also make interface more uniformly consistent
 | 
			
		||||
  //////////////////////////////////////////////////////////////////
 | 
			
		||||
  RealD M(const FermionField &in, FermionField &out);
 | 
			
		||||
  RealD Mdag(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual RealD M(const FermionField &in, FermionField &out);
 | 
			
		||||
  virtual RealD Mdag(const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  // half checkerboard operations
 | 
			
		||||
@@ -123,8 +138,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 | 
			
		||||
 | 
			
		||||
  // Constructor
 | 
			
		||||
  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
 | 
			
		||||
                GridRedBlackCartesian &Hgrid, RealD _mass,
 | 
			
		||||
                const ImplParams &p = ImplParams());
 | 
			
		||||
                GridRedBlackCartesian &Hgrid, RealD _mass, 
 | 
			
		||||
                const ImplParams &p = ImplParams(), 
 | 
			
		||||
                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
 | 
			
		||||
 | 
			
		||||
  // DoubleStore impl dependent
 | 
			
		||||
  void ImportGauge(const GaugeField &_Umu);
 | 
			
		||||
@@ -138,6 +154,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 | 
			
		||||
  virtual RealD Mass(void) { return mass; }
 | 
			
		||||
  virtual int   isTrivialEE(void) { return 1; };
 | 
			
		||||
  RealD mass;
 | 
			
		||||
  RealD diag_mass;
 | 
			
		||||
 | 
			
		||||
  GridBase *_grid;
 | 
			
		||||
  GridBase *_cbgrid;
 | 
			
		||||
@@ -154,6 +171,24 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
 | 
			
		||||
 | 
			
		||||
  LebesgueOrder Lebesgue;
 | 
			
		||||
  LebesgueOrder LebesgueEvenOdd;
 | 
			
		||||
 | 
			
		||||
  WilsonAnisotropyCoefficients anisotropyCoeff;
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  // Conserved current utilities
 | 
			
		||||
  ///////////////////////////////////////////////////////////////
 | 
			
		||||
  void ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                PropagatorField &q_in_2,
 | 
			
		||||
                                PropagatorField &q_out,
 | 
			
		||||
                                Current curr_type,
 | 
			
		||||
                                unsigned int mu);
 | 
			
		||||
  void SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                           PropagatorField &q_out,
 | 
			
		||||
                           Current curr_type,
 | 
			
		||||
                           unsigned int mu, 
 | 
			
		||||
                           std::vector<Real> mom,
 | 
			
		||||
                           unsigned int tmin,
 | 
			
		||||
                           unsigned int tmax);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
			
		||||
 
 | 
			
		||||
@@ -12,6 +12,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
 | 
			
		||||
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
@@ -701,6 +702,168 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Conserved current utilities for Wilson fermions, for contracting propagators
 | 
			
		||||
 * to make a conserved current sink or inserting the conserved current 
 | 
			
		||||
 * sequentially.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
 | 
			
		||||
// Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
 | 
			
		||||
#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
 | 
			
		||||
{ \
 | 
			
		||||
    std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
 | 
			
		||||
    extract(qSite, qSiteVec); \
 | 
			
		||||
    for (int i = 0; i < Nsimd / 2; ++i) \
 | 
			
		||||
    { \
 | 
			
		||||
        typename SitePropagator::scalar_object tmp = qSiteVec[i]; \
 | 
			
		||||
        qSiteVec[i] = qSiteVec[Nsimd - i - 1]; \
 | 
			
		||||
        qSiteVec[Nsimd - i - 1] = tmp; \
 | 
			
		||||
    } \
 | 
			
		||||
    merge(qSiteRev, qSiteVec); \
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                                     PropagatorField &q_in_2,
 | 
			
		||||
                                                     PropagatorField &q_out,
 | 
			
		||||
                                                     Current curr_type,
 | 
			
		||||
                                                     unsigned int mu)
 | 
			
		||||
{
 | 
			
		||||
    conformable(q_in_1._grid, FermionGrid());
 | 
			
		||||
    conformable(q_in_1._grid, q_in_2._grid);
 | 
			
		||||
    conformable(_FourDimGrid, q_out._grid);
 | 
			
		||||
    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
 | 
			
		||||
    unsigned int LLs = q_in_1._grid->_rdimensions[0];
 | 
			
		||||
    q_out = zero;
 | 
			
		||||
 | 
			
		||||
    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
 | 
			
		||||
    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
 | 
			
		||||
    tmp1 = Cshift(q_in_1, mu + 1, 1);
 | 
			
		||||
    tmp2 = Cshift(q_in_2, mu + 1, 1);
 | 
			
		||||
    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
 | 
			
		||||
    {
 | 
			
		||||
        unsigned int sF1 = sU * LLs;
 | 
			
		||||
        unsigned int sF2 = (sU + 1) * LLs - 1;
 | 
			
		||||
 | 
			
		||||
        for (unsigned int s = 0; s < LLs; ++s)
 | 
			
		||||
        {
 | 
			
		||||
            bool axial_sign = ((curr_type == Current::Axial) && \
 | 
			
		||||
                               (s < (LLs / 2)));
 | 
			
		||||
            SitePropagator qSite2, qmuSite2;
 | 
			
		||||
 | 
			
		||||
            // If vectorised in 5th dimension, reverse q2 vector to match up
 | 
			
		||||
            // sites correctly.
 | 
			
		||||
            if (Impl::LsVectorised)
 | 
			
		||||
            {
 | 
			
		||||
                REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
 | 
			
		||||
                REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
                qSite2   = q_in_2._odata[sF2];
 | 
			
		||||
                qmuSite2 = tmp2._odata[sF2];
 | 
			
		||||
            }
 | 
			
		||||
            Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1], 
 | 
			
		||||
                                                     qSite2, 
 | 
			
		||||
                                                     q_out._odata[sU],
 | 
			
		||||
                                                     Umu, sU, mu, axial_sign);
 | 
			
		||||
            Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
 | 
			
		||||
                                                     qmuSite2,
 | 
			
		||||
                                                     q_out._odata[sU],
 | 
			
		||||
                                                     Umu, sU, mu, axial_sign);
 | 
			
		||||
            sF1++;
 | 
			
		||||
            sF2--;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
 | 
			
		||||
                                                PropagatorField &q_out,
 | 
			
		||||
                                                Current curr_type, 
 | 
			
		||||
                                                unsigned int mu,
 | 
			
		||||
                                                std::vector<Real> mom,
 | 
			
		||||
                                                unsigned int tmin, 
 | 
			
		||||
                                                unsigned int tmax)
 | 
			
		||||
{
 | 
			
		||||
    conformable(q_in._grid, FermionGrid());
 | 
			
		||||
    conformable(q_in._grid, q_out._grid);
 | 
			
		||||
    Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
 | 
			
		||||
    PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
 | 
			
		||||
                    tmp(FermionGrid());
 | 
			
		||||
    ComplexD i(0.0, 1.0);
 | 
			
		||||
    unsigned int tshift = (mu == Tp) ? 1 : 0;
 | 
			
		||||
    unsigned int LLs = q_in._grid->_rdimensions[0];
 | 
			
		||||
    unsigned int LLt    = GridDefaultLatt()[Tp];
 | 
			
		||||
 | 
			
		||||
    // Momentum projection.
 | 
			
		||||
    ph = zero;
 | 
			
		||||
    for(unsigned int nu = 0; nu < Nd - 1; nu++)
 | 
			
		||||
    {
 | 
			
		||||
        // Shift coordinate lattice index by 1 to account for 5th dimension.
 | 
			
		||||
        LatticeCoordinate(coor, nu + 1);
 | 
			
		||||
        ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
 | 
			
		||||
    }
 | 
			
		||||
    ph = exp((RealD)(2*M_PI)*i*ph);
 | 
			
		||||
 | 
			
		||||
    q_out = zero;
 | 
			
		||||
    LatticeInteger coords(_FourDimGrid);
 | 
			
		||||
    LatticeCoordinate(coords, Tp);
 | 
			
		||||
 | 
			
		||||
    // Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
 | 
			
		||||
    // by one.
 | 
			
		||||
    tmp = Cshift(q_in, mu + 1, 1);
 | 
			
		||||
    tmpFwd = tmp*ph;
 | 
			
		||||
    tmp = ph*q_in;
 | 
			
		||||
    tmpBwd = Cshift(tmp, mu + 1, -1);
 | 
			
		||||
 | 
			
		||||
    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
 | 
			
		||||
    {
 | 
			
		||||
        // Compute the sequential conserved current insertion only if our simd
 | 
			
		||||
        // object contains a timeslice we need.
 | 
			
		||||
        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
 | 
			
		||||
                             (coords._odata[sU] <= tmax));
 | 
			
		||||
        Integer timeSlices = Reduce(t_mask);
 | 
			
		||||
 | 
			
		||||
        if (timeSlices > 0)
 | 
			
		||||
        {
 | 
			
		||||
            unsigned int sF = sU * LLs;
 | 
			
		||||
            for (unsigned int s = 0; s < LLs; ++s)
 | 
			
		||||
            {
 | 
			
		||||
                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
 | 
			
		||||
                Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF], 
 | 
			
		||||
                                                    q_out._odata[sF], Umu, sU,
 | 
			
		||||
                                                    mu, t_mask, axial_sign);
 | 
			
		||||
                ++sF;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Repeat for backward direction.
 | 
			
		||||
        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
 | 
			
		||||
                      (coords._odata[sU] <= (tmax + tshift)));
 | 
			
		||||
 | 
			
		||||
	//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
 | 
			
		||||
	unsigned int t0 = 0;
 | 
			
		||||
	if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
 | 
			
		||||
 | 
			
		||||
        timeSlices = Reduce(t_mask);
 | 
			
		||||
 | 
			
		||||
        if (timeSlices > 0)
 | 
			
		||||
        {
 | 
			
		||||
            unsigned int sF = sU * LLs;
 | 
			
		||||
            for (unsigned int s = 0; s < LLs; ++s)
 | 
			
		||||
            {
 | 
			
		||||
                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
 | 
			
		||||
                Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF], 
 | 
			
		||||
                                                    q_out._odata[sF], Umu, sU,
 | 
			
		||||
                                                    mu, t_mask, axial_sign);
 | 
			
		||||
                ++sF;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpTemplateInstantiate(WilsonFermion5D);
 | 
			
		||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
@@ -214,6 +214,21 @@ namespace QCD {
 | 
			
		||||
    // Comms buffer
 | 
			
		||||
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
 | 
			
		||||
    
 | 
			
		||||
    ///////////////////////////////////////////////////////////////
 | 
			
		||||
    // Conserved current utilities
 | 
			
		||||
    ///////////////////////////////////////////////////////////////
 | 
			
		||||
    void ContractConservedCurrent(PropagatorField &q_in_1,
 | 
			
		||||
                                  PropagatorField &q_in_2,
 | 
			
		||||
                                  PropagatorField &q_out,
 | 
			
		||||
                                  Current curr_type, 
 | 
			
		||||
                                  unsigned int mu);
 | 
			
		||||
    void SeqConservedCurrent(PropagatorField &q_in,
 | 
			
		||||
                             PropagatorField &q_out,
 | 
			
		||||
                             Current curr_type,
 | 
			
		||||
                             unsigned int mu,
 | 
			
		||||
                             std::vector<Real> mom,
 | 
			
		||||
                             unsigned int tmin,
 | 
			
		||||
                             unsigned int tmax);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -281,6 +281,172 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
 | 
			
		||||
  vstream(out._odata[sF], result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Conserved current utilities for Wilson fermions, for contracting propagators
 | 
			
		||||
 * to make a conserved current sink or inserting the conserved current 
 | 
			
		||||
 * sequentially. Common to both 4D and 5D.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
// N.B. Functions below assume a -1/2 factor within U.
 | 
			
		||||
#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
 | 
			
		||||
#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Name: ContractConservedCurrentSiteFwd
 | 
			
		||||
 * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
 | 
			
		||||
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 | 
			
		||||
 *        - Pass in q_in_1 shifted in +ve mu direction.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
 | 
			
		||||
                                                  const SitePropagator &q_in_1,
 | 
			
		||||
                                                  const SitePropagator &q_in_2,
 | 
			
		||||
                                                  SitePropagator &q_out,
 | 
			
		||||
                                                  DoubledGaugeField &U,
 | 
			
		||||
                                                  unsigned int sU,
 | 
			
		||||
                                                  unsigned int mu,
 | 
			
		||||
                                                  bool switch_sign)
 | 
			
		||||
{
 | 
			
		||||
    SitePropagator result, tmp;
 | 
			
		||||
    Gamma g5(Gamma::Algebra::Gamma5);
 | 
			
		||||
    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
 | 
			
		||||
    result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
 | 
			
		||||
    if (switch_sign)
 | 
			
		||||
    {
 | 
			
		||||
        q_out -= result;
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        q_out += result;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Name: ContractConservedCurrentSiteBwd
 | 
			
		||||
 * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
 | 
			
		||||
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 | 
			
		||||
 *        - Pass in q_in_2 shifted in +ve mu direction.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
 | 
			
		||||
                                                  const SitePropagator &q_in_1,
 | 
			
		||||
                                                  const SitePropagator &q_in_2,
 | 
			
		||||
                                                  SitePropagator &q_out,
 | 
			
		||||
                                                  DoubledGaugeField &U,
 | 
			
		||||
                                                  unsigned int sU,
 | 
			
		||||
                                                  unsigned int mu,
 | 
			
		||||
                                                  bool switch_sign)
 | 
			
		||||
{
 | 
			
		||||
    SitePropagator result, tmp;
 | 
			
		||||
    Gamma g5(Gamma::Algebra::Gamma5);
 | 
			
		||||
    Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
 | 
			
		||||
    result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
 | 
			
		||||
    if (switch_sign)
 | 
			
		||||
    {
 | 
			
		||||
        q_out += result;
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        q_out -= result;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// G-parity requires more specialised implementation.
 | 
			
		||||
#define NO_CURR_SITE(Impl) \
 | 
			
		||||
template <> \
 | 
			
		||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
 | 
			
		||||
                                                  const SitePropagator &q_in_1, \
 | 
			
		||||
                                                  const SitePropagator &q_in_2, \
 | 
			
		||||
                                                  SitePropagator &q_out,        \
 | 
			
		||||
                                                  DoubledGaugeField &U,         \
 | 
			
		||||
                                                  unsigned int sU,              \
 | 
			
		||||
                                                  unsigned int mu,              \
 | 
			
		||||
                                                  bool switch_sign)             \
 | 
			
		||||
{ \
 | 
			
		||||
    assert(0); \
 | 
			
		||||
} \
 | 
			
		||||
template <> \
 | 
			
		||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
 | 
			
		||||
                                                  const SitePropagator &q_in_1, \
 | 
			
		||||
                                                  const SitePropagator &q_in_2, \
 | 
			
		||||
                                                  SitePropagator &q_out,        \
 | 
			
		||||
                                                  DoubledGaugeField &U,         \
 | 
			
		||||
                                                  unsigned int mu,              \
 | 
			
		||||
                                                  unsigned int sU,              \
 | 
			
		||||
                                                  bool switch_sign)             \
 | 
			
		||||
{ \
 | 
			
		||||
    assert(0); \
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NO_CURR_SITE(GparityWilsonImplF);
 | 
			
		||||
NO_CURR_SITE(GparityWilsonImplD);
 | 
			
		||||
NO_CURR_SITE(GparityWilsonImplFH);
 | 
			
		||||
NO_CURR_SITE(GparityWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Name: SeqConservedCurrentSiteFwd
 | 
			
		||||
 * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
 | 
			
		||||
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 | 
			
		||||
 *        - Pass in q_in shifted in +ve mu direction.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
 | 
			
		||||
                                                     SitePropagator &q_out,
 | 
			
		||||
                                                     DoubledGaugeField &U,
 | 
			
		||||
                                                     unsigned int sU,
 | 
			
		||||
                                                     unsigned int mu,
 | 
			
		||||
                                                     vInteger t_mask,
 | 
			
		||||
                                                     bool switch_sign)
 | 
			
		||||
{
 | 
			
		||||
    SitePropagator result;
 | 
			
		||||
    Impl::multLinkProp(result, U._odata[sU], q_in, mu);
 | 
			
		||||
    result = WilsonCurrentFwd(result, mu);
 | 
			
		||||
 | 
			
		||||
    // Zero any unwanted timeslice entries.
 | 
			
		||||
    result = predicatedWhere(t_mask, result, 0.*result);
 | 
			
		||||
 | 
			
		||||
    if (switch_sign)
 | 
			
		||||
    {
 | 
			
		||||
        q_out -= result;
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        q_out += result;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*******************************************************************************
 | 
			
		||||
 * Name: SeqConservedCurrentSiteFwd
 | 
			
		||||
 * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
 | 
			
		||||
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 | 
			
		||||
 *        - Pass in q_in shifted in -ve mu direction.
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
 | 
			
		||||
                                                     SitePropagator &q_out,
 | 
			
		||||
                                                     DoubledGaugeField &U,
 | 
			
		||||
                                                     unsigned int sU,
 | 
			
		||||
                                                     unsigned int mu,
 | 
			
		||||
                                                     vInteger t_mask,
 | 
			
		||||
                                                     bool switch_sign)
 | 
			
		||||
{
 | 
			
		||||
    SitePropagator result;
 | 
			
		||||
    Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
 | 
			
		||||
    result = WilsonCurrentBwd(result, mu);
 | 
			
		||||
 | 
			
		||||
    // Zero any unwanted timeslice entries.
 | 
			
		||||
    result = predicatedWhere(t_mask, result, 0.*result);
 | 
			
		||||
 | 
			
		||||
    if (switch_sign)
 | 
			
		||||
    {
 | 
			
		||||
        q_out += result;
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        q_out -= result;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
FermOpTemplateInstantiate(WilsonKernels);
 | 
			
		||||
AdjointFermOpTemplateInstantiate(WilsonKernels);
 | 
			
		||||
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
 | 
			
		||||
 
 | 
			
		||||
@@ -55,7 +55,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
 | 
			
		||||
  typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
 | 
			
		||||
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
 | 
			
		||||
  {
 | 
			
		||||
@@ -99,7 +99,7 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
     
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
 | 
			
		||||
  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
 | 
			
		||||
  DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
	   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
 | 
			
		||||
    // no kernel choice  
 | 
			
		||||
@@ -116,7 +116,7 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
     
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
 | 
			
		||||
  typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
 | 
			
		||||
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
	      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) 
 | 
			
		||||
{
 | 
			
		||||
@@ -161,7 +161,7 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
 | 
			
		||||
  typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
 | 
			
		||||
  DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
 | 
			
		||||
		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
 | 
			
		||||
 | 
			
		||||
@@ -180,6 +180,38 @@ public:
 | 
			
		||||
  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 | 
			
		||||
		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
 | 
			
		||||
      
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Utilities for inserting Wilson conserved current.
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
 | 
			
		||||
                                       const SitePropagator &q_in_2,
 | 
			
		||||
                                       SitePropagator &q_out,
 | 
			
		||||
                                       DoubledGaugeField &U,
 | 
			
		||||
                                       unsigned int sU,
 | 
			
		||||
                                       unsigned int mu,
 | 
			
		||||
                                       bool switch_sign = false);
 | 
			
		||||
  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
 | 
			
		||||
                                       const SitePropagator &q_in_2,
 | 
			
		||||
                                       SitePropagator &q_out,
 | 
			
		||||
                                       DoubledGaugeField &U,
 | 
			
		||||
                                       unsigned int sU,
 | 
			
		||||
                                       unsigned int mu,
 | 
			
		||||
                                       bool switch_sign = false);
 | 
			
		||||
  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
 | 
			
		||||
                                  SitePropagator &q_out,
 | 
			
		||||
                                  DoubledGaugeField &U,
 | 
			
		||||
                                  unsigned int sU,
 | 
			
		||||
                                  unsigned int mu,
 | 
			
		||||
                                  vInteger t_mask,
 | 
			
		||||
                                  bool switch_sign = false);
 | 
			
		||||
  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
 | 
			
		||||
                                  SitePropagator &q_out,
 | 
			
		||||
                                  DoubledGaugeField &U,
 | 
			
		||||
                                  unsigned int sU,
 | 
			
		||||
                                  unsigned int mu,
 | 
			
		||||
                                  vInteger t_mask,
 | 
			
		||||
                                  bool switch_sign = false);
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
     // Specialised variants
 | 
			
		||||
  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
 
 | 
			
		||||
@@ -30,181 +30,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#define REGISTER
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU_BODY(F)			\
 | 
			
		||||
  Chimu_00=ref(F)(0)(0);			\
 | 
			
		||||
  Chimu_01=ref(F)(0)(1);			\
 | 
			
		||||
  Chimu_02=ref(F)(0)(2);			\
 | 
			
		||||
  Chimu_10=ref(F)(1)(0);			\
 | 
			
		||||
  Chimu_11=ref(F)(1)(1);			\
 | 
			
		||||
  Chimu_12=ref(F)(1)(2);			\
 | 
			
		||||
  Chimu_20=ref(F)(2)(0);			\
 | 
			
		||||
  Chimu_21=ref(F)(2)(1);			\
 | 
			
		||||
  Chimu_22=ref(F)(2)(2);			\
 | 
			
		||||
  Chimu_30=ref(F)(3)(0);			\
 | 
			
		||||
  Chimu_31=ref(F)(3)(1);			\
 | 
			
		||||
  Chimu_32=ref(F)(3)(2)
 | 
			
		||||
#define LOAD_CHIMU \
 | 
			
		||||
  {const SiteSpinor & ref (in._odata[offset]);	\
 | 
			
		||||
    Chimu_00=ref()(0)(0);\
 | 
			
		||||
    Chimu_01=ref()(0)(1);\
 | 
			
		||||
    Chimu_02=ref()(0)(2);\
 | 
			
		||||
    Chimu_10=ref()(1)(0);\
 | 
			
		||||
    Chimu_11=ref()(1)(1);\
 | 
			
		||||
    Chimu_12=ref()(1)(2);\
 | 
			
		||||
    Chimu_20=ref()(2)(0);\
 | 
			
		||||
    Chimu_21=ref()(2)(1);\
 | 
			
		||||
    Chimu_22=ref()(2)(2);\
 | 
			
		||||
    Chimu_30=ref()(3)(0);\
 | 
			
		||||
    Chimu_31=ref()(3)(1);\
 | 
			
		||||
    Chimu_32=ref()(3)(2);}
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU(DIR,F,PERM)						\
 | 
			
		||||
  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_BODY(F)				\
 | 
			
		||||
    Chi_00 = ref(F)(0)(0);\
 | 
			
		||||
    Chi_01 = ref(F)(0)(1);\
 | 
			
		||||
    Chi_02 = ref(F)(0)(2);\
 | 
			
		||||
    Chi_10 = ref(F)(1)(0);\
 | 
			
		||||
    Chi_11 = ref(F)(1)(1);\
 | 
			
		||||
    Chi_12 = ref(F)(1)(2)
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI(DIR,F,PERM)					\
 | 
			
		||||
  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//G-parity implementations using in-place intrinsic ops
 | 
			
		||||
 | 
			
		||||
//1l 1h -> 1h 1l
 | 
			
		||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
 | 
			
		||||
//0h,1l -> 1l,0h
 | 
			
		||||
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
 | 
			
		||||
//Pulled fermion through forwards face, GPBC on upper component
 | 
			
		||||
//Need 0= 0l 1h   1= 1l 0h
 | 
			
		||||
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
 | 
			
		||||
//Pulled fermion through backwards face, GPBC on lower component
 | 
			
		||||
//Need 0= 1l 0h   1= 0l 1h
 | 
			
		||||
 | 
			
		||||
//1l 1h -> 1h 1l
 | 
			
		||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
 | 
			
		||||
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
 | 
			
		||||
  permute##PERM(tmp1, ref(1)(S)(C));				\
 | 
			
		||||
  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
 | 
			
		||||
  INTO = tmp2;
 | 
			
		||||
 | 
			
		||||
//0l 0h -> 0h 0l
 | 
			
		||||
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
 | 
			
		||||
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
 | 
			
		||||
  permute##PERM(tmp1, ref(0)(S)(C));				\
 | 
			
		||||
  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
 | 
			
		||||
  INTO = tmp2;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_SETUP(DIR,F)						\
 | 
			
		||||
  g = F;								\
 | 
			
		||||
  direction = st._directions[DIR];				\
 | 
			
		||||
  distance = st._distances[DIR];				\
 | 
			
		||||
  sl = st._grid->_simd_layout[direction];			\
 | 
			
		||||
  inplace_twist = 0;						\
 | 
			
		||||
  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
 | 
			
		||||
    if(sl == 1){							\
 | 
			
		||||
      g = (F+1) % 2;							\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      inplace_twist = 1;						\
 | 
			
		||||
    }									\
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
 | 
			
		||||
  { const SiteSpinor &ref(in._odata[offset]);				\
 | 
			
		||||
    LOAD_CHI_SETUP(DIR,F);						\
 | 
			
		||||
    if(!inplace_twist){							\
 | 
			
		||||
      LOAD_CHIMU_BODY(g);						\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 | 
			
		||||
	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
      }else{								\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
      } \
 | 
			
		||||
    } \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
 | 
			
		||||
  { const SiteHalfSpinor &ref(buf[offset]);				\
 | 
			
		||||
    LOAD_CHI_SETUP(DIR,F);						\
 | 
			
		||||
    if(!inplace_twist){							\
 | 
			
		||||
      LOAD_CHI_BODY(g);							\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 | 
			
		||||
	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
      }else{								\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
      }									\
 | 
			
		||||
    }									\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 | 
			
		||||
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 | 
			
		||||
#define LOAD_CHI\
 | 
			
		||||
  {const SiteHalfSpinor &ref(buf[offset]);	\
 | 
			
		||||
    Chi_00 = ref()(0)(0);\
 | 
			
		||||
    Chi_01 = ref()(0)(1);\
 | 
			
		||||
    Chi_02 = ref()(0)(2);\
 | 
			
		||||
    Chi_10 = ref()(1)(0);\
 | 
			
		||||
    Chi_11 = ref()(1)(1);\
 | 
			
		||||
    Chi_12 = ref()(1)(2);}
 | 
			
		||||
 | 
			
		||||
// To splat or not to splat depends on the implementation
 | 
			
		||||
#define MULT_2SPIN_BODY \
 | 
			
		||||
  Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
  Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
  Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
  UChi_00 = U_00*Chi_00;			\
 | 
			
		||||
  UChi_10 = U_00*Chi_10;			\
 | 
			
		||||
  UChi_01 = U_10*Chi_00;			\
 | 
			
		||||
  UChi_11 = U_10*Chi_10;			\
 | 
			
		||||
  UChi_02 = U_20*Chi_00;			\
 | 
			
		||||
  UChi_12 = U_20*Chi_10;			\
 | 
			
		||||
  UChi_00+= U_01*Chi_01;			\
 | 
			
		||||
  UChi_10+= U_01*Chi_11;			\
 | 
			
		||||
  UChi_01+= U_11*Chi_01;			\
 | 
			
		||||
  UChi_11+= U_11*Chi_11;			\
 | 
			
		||||
  UChi_02+= U_21*Chi_01;			\
 | 
			
		||||
  UChi_12+= U_21*Chi_11;			\
 | 
			
		||||
  Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
  Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
  Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
  UChi_00+= U_00*Chi_02;			\
 | 
			
		||||
  UChi_10+= U_00*Chi_12;			\
 | 
			
		||||
  UChi_01+= U_10*Chi_02;			\
 | 
			
		||||
  UChi_11+= U_10*Chi_12;			\
 | 
			
		||||
  UChi_02+= U_20*Chi_02;			\
 | 
			
		||||
  UChi_12+= U_20*Chi_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN(A,F)					\
 | 
			
		||||
  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_GPARITY(A,F)				\
 | 
			
		||||
  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
 | 
			
		||||
#define MULT_2SPIN(A)\
 | 
			
		||||
  {auto & ref(U._odata[sU](A));			\
 | 
			
		||||
   Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
    UChi_00 = U_00*Chi_00;\
 | 
			
		||||
    UChi_10 = U_00*Chi_10;\
 | 
			
		||||
    UChi_01 = U_10*Chi_00;\
 | 
			
		||||
    UChi_11 = U_10*Chi_10;\
 | 
			
		||||
    UChi_02 = U_20*Chi_00;\
 | 
			
		||||
    UChi_12 = U_20*Chi_10;\
 | 
			
		||||
    UChi_00+= U_01*Chi_01;\
 | 
			
		||||
    UChi_10+= U_01*Chi_11;\
 | 
			
		||||
    UChi_01+= U_11*Chi_01;\
 | 
			
		||||
    UChi_11+= U_11*Chi_11;\
 | 
			
		||||
    UChi_02+= U_21*Chi_01;\
 | 
			
		||||
    UChi_12+= U_21*Chi_11;\
 | 
			
		||||
    Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
    UChi_00+= U_00*Chi_02;\
 | 
			
		||||
    UChi_10+= U_00*Chi_12;\
 | 
			
		||||
    UChi_01+= U_10*Chi_02;\
 | 
			
		||||
    UChi_11+= U_10*Chi_12;\
 | 
			
		||||
    UChi_02+= U_20*Chi_02;\
 | 
			
		||||
    UChi_12+= U_20*Chi_12;}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR(dir)			\
 | 
			
		||||
@@ -428,87 +307,84 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
  result_31-= UChi_11;	\
 | 
			
		||||
  result_32-= UChi_12;
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if ( local ) {				\
 | 
			
		||||
    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    LOAD_CHIMU;					\
 | 
			
		||||
    PROJ;					\
 | 
			
		||||
    if ( perm) {				\
 | 
			
		||||
      PERMUTE_DIR(PERM);			\
 | 
			
		||||
    }						\
 | 
			
		||||
  } else {					\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    LOAD_CHI;					\
 | 
			
		||||
  }						\
 | 
			
		||||
  MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
  MULT_2SPIN(DIR);				\
 | 
			
		||||
  RECON;					
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
 | 
			
		||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if ( local ) {				\
 | 
			
		||||
    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    LOAD_CHIMU;					\
 | 
			
		||||
    PROJ;					\
 | 
			
		||||
    if ( perm) {				\
 | 
			
		||||
      PERMUTE_DIR(PERM);			\
 | 
			
		||||
    }						\
 | 
			
		||||
  } else if ( st.same_node[DIR] ) {		\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    LOAD_CHI;					\
 | 
			
		||||
  }						\
 | 
			
		||||
  if (local || st.same_node[DIR] ) {		\
 | 
			
		||||
    MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
    MULT_2SPIN(DIR);				\
 | 
			
		||||
    RECON;					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
 | 
			
		||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
    LOAD_CHI;					\
 | 
			
		||||
    MULT_2SPIN(DIR);				\
 | 
			
		||||
    RECON;					\
 | 
			
		||||
    nmu++;					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_RESULT(ss,F)			\
 | 
			
		||||
#define HAND_RESULT(ss)				\
 | 
			
		||||
  {						\
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);		\
 | 
			
		||||
    vstream(ref(F)(0)(0),result_00);		\
 | 
			
		||||
    vstream(ref(F)(0)(1),result_01);		\
 | 
			
		||||
    vstream(ref(F)(0)(2),result_02);		\
 | 
			
		||||
    vstream(ref(F)(1)(0),result_10);		\
 | 
			
		||||
    vstream(ref(F)(1)(1),result_11);		\
 | 
			
		||||
    vstream(ref(F)(1)(2),result_12);		\
 | 
			
		||||
    vstream(ref(F)(2)(0),result_20);		\
 | 
			
		||||
    vstream(ref(F)(2)(1),result_21);		\
 | 
			
		||||
    vstream(ref(F)(2)(2),result_22);		\
 | 
			
		||||
    vstream(ref(F)(3)(0),result_30);		\
 | 
			
		||||
    vstream(ref(F)(3)(1),result_31);		\
 | 
			
		||||
    vstream(ref(F)(3)(2),result_32);		\
 | 
			
		||||
    vstream(ref()(0)(0),result_00);		\
 | 
			
		||||
    vstream(ref()(0)(1),result_01);		\
 | 
			
		||||
    vstream(ref()(0)(2),result_02);		\
 | 
			
		||||
    vstream(ref()(1)(0),result_10);		\
 | 
			
		||||
    vstream(ref()(1)(1),result_11);		\
 | 
			
		||||
    vstream(ref()(1)(2),result_12);		\
 | 
			
		||||
    vstream(ref()(2)(0),result_20);		\
 | 
			
		||||
    vstream(ref()(2)(1),result_21);		\
 | 
			
		||||
    vstream(ref()(2)(2),result_22);		\
 | 
			
		||||
    vstream(ref()(3)(0),result_30);		\
 | 
			
		||||
    vstream(ref()(3)(1),result_31);		\
 | 
			
		||||
    vstream(ref()(3)(2),result_32);		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_RESULT_EXT(ss,F)			\
 | 
			
		||||
#define HAND_RESULT_EXT(ss)			\
 | 
			
		||||
  if (nmu){					\
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);		\
 | 
			
		||||
    ref(F)(0)(0)+=result_00;		\
 | 
			
		||||
    ref(F)(0)(1)+=result_01;		\
 | 
			
		||||
    ref(F)(0)(2)+=result_02;		\
 | 
			
		||||
    ref(F)(1)(0)+=result_10;		\
 | 
			
		||||
    ref(F)(1)(1)+=result_11;		\
 | 
			
		||||
    ref(F)(1)(2)+=result_12;		\
 | 
			
		||||
    ref(F)(2)(0)+=result_20;		\
 | 
			
		||||
    ref(F)(2)(1)+=result_21;		\
 | 
			
		||||
    ref(F)(2)(2)+=result_22;		\
 | 
			
		||||
    ref(F)(3)(0)+=result_30;		\
 | 
			
		||||
    ref(F)(3)(1)+=result_31;		\
 | 
			
		||||
    ref(F)(3)(2)+=result_32;		\
 | 
			
		||||
    ref()(0)(0)+=result_00;		\
 | 
			
		||||
    ref()(0)(1)+=result_01;		\
 | 
			
		||||
    ref()(0)(2)+=result_02;		\
 | 
			
		||||
    ref()(1)(0)+=result_10;		\
 | 
			
		||||
    ref()(1)(1)+=result_11;		\
 | 
			
		||||
    ref()(1)(2)+=result_12;		\
 | 
			
		||||
    ref()(2)(0)+=result_20;		\
 | 
			
		||||
    ref()(2)(1)+=result_21;		\
 | 
			
		||||
    ref()(2)(2)+=result_22;		\
 | 
			
		||||
    ref()(3)(0)+=result_30;		\
 | 
			
		||||
    ref()(3)(1)+=result_31;		\
 | 
			
		||||
    ref()(3)(2)+=result_32;		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -587,18 +463,15 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -612,19 +485,16 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
  
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
@@ -639,20 +509,16 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 | 
			
		||||
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -666,20 +532,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
 | 
			
		||||
  ZERO_RESULT;							\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
  
 | 
			
		||||
  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
@@ -695,20 +557,16 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT_EXT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT_EXT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -723,193 +581,18 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT_EXT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT_EXT(ss);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // Specialise Gparity to simple implementation
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
#define HAND_SPECIALISE_EMPTY(IMPL)					\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,			\
 | 
			
		||||
				    LebesgueOrder &lo,			\
 | 
			
		||||
				    DoubledGaugeField &U,		\
 | 
			
		||||
				    SiteHalfSpinor *buf,		\
 | 
			
		||||
				    int sF,int sU,			\
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,			\
 | 
			
		||||
				    LebesgueOrder &lo,			\
 | 
			
		||||
				    DoubledGaugeField &U,		\
 | 
			
		||||
				    SiteHalfSpinor *buf,		\
 | 
			
		||||
				    int sF,int sU,			\
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,			\
 | 
			
		||||
				    LebesgueOrder &lo,			\
 | 
			
		||||
				    DoubledGaugeField &U,		\
 | 
			
		||||
				    SiteHalfSpinor *buf,		\
 | 
			
		||||
				    int sF,int sU,			\
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,			\
 | 
			
		||||
				    LebesgueOrder &lo,			\
 | 
			
		||||
				    DoubledGaugeField &U,		\
 | 
			
		||||
				    SiteHalfSpinor *buf,		\
 | 
			
		||||
				    int sF,int sU,			\
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,	       	\
 | 
			
		||||
				    LebesgueOrder &lo,			\
 | 
			
		||||
				    DoubledGaugeField &U,		\
 | 
			
		||||
				    SiteHalfSpinor *buf,		\
 | 
			
		||||
				    int sF,int sU,			\
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,	       	\
 | 
			
		||||
				    LebesgueOrder &lo,			\
 | 
			
		||||
				    DoubledGaugeField &U,		\
 | 
			
		||||
				    SiteHalfSpinor *buf,		\
 | 
			
		||||
				    int sF,int sU,			\
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define HAND_SPECIALISE_GPARITY(IMPL)					\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
				    int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
					    int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
 | 
			
		||||
    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
							     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int nmu=0;								\
 | 
			
		||||
    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    nmu = 0;								\
 | 
			
		||||
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
							     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    int nmu=0;								\
 | 
			
		||||
    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    nmu = 0;								\
 | 
			
		||||
    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
////////////// Wilson ; uses this implementation /////////////////////
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_THEM(A) \
 | 
			
		||||
@@ -930,8 +613,6 @@ INSTANTIATE_THEM(WilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(WilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(ZWilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(ZWilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(DomainWallVec5dImplF);
 | 
			
		||||
INSTANTIATE_THEM(DomainWallVec5dImplD);
 | 
			
		||||
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
 | 
			
		||||
@@ -940,11 +621,11 @@ INSTANTIATE_THEM(WilsonImplFH);
 | 
			
		||||
INSTANTIATE_THEM(WilsonImplDF);
 | 
			
		||||
INSTANTIATE_THEM(ZWilsonImplFH);
 | 
			
		||||
INSTANTIATE_THEM(ZWilsonImplDF);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplFH);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplDF);
 | 
			
		||||
INSTANTIATE_THEM(DomainWallVec5dImplFH);
 | 
			
		||||
INSTANTIATE_THEM(DomainWallVec5dImplDF);
 | 
			
		||||
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
 | 
			
		||||
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
 | 
			
		||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
 | 
			
		||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										878
									
								
								lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										878
									
								
								lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,878 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
 | 
			
		||||
#define REGISTER
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU_BODY(F)			\
 | 
			
		||||
  Chimu_00=ref(F)(0)(0);			\
 | 
			
		||||
  Chimu_01=ref(F)(0)(1);			\
 | 
			
		||||
  Chimu_02=ref(F)(0)(2);			\
 | 
			
		||||
  Chimu_10=ref(F)(1)(0);			\
 | 
			
		||||
  Chimu_11=ref(F)(1)(1);			\
 | 
			
		||||
  Chimu_12=ref(F)(1)(2);			\
 | 
			
		||||
  Chimu_20=ref(F)(2)(0);			\
 | 
			
		||||
  Chimu_21=ref(F)(2)(1);			\
 | 
			
		||||
  Chimu_22=ref(F)(2)(2);			\
 | 
			
		||||
  Chimu_30=ref(F)(3)(0);			\
 | 
			
		||||
  Chimu_31=ref(F)(3)(1);			\
 | 
			
		||||
  Chimu_32=ref(F)(3)(2)
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU(DIR,F,PERM)						\
 | 
			
		||||
  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_BODY(F)				\
 | 
			
		||||
    Chi_00 = ref(F)(0)(0);\
 | 
			
		||||
    Chi_01 = ref(F)(0)(1);\
 | 
			
		||||
    Chi_02 = ref(F)(0)(2);\
 | 
			
		||||
    Chi_10 = ref(F)(1)(0);\
 | 
			
		||||
    Chi_11 = ref(F)(1)(1);\
 | 
			
		||||
    Chi_12 = ref(F)(1)(2)
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI(DIR,F,PERM)					\
 | 
			
		||||
  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//G-parity implementations using in-place intrinsic ops
 | 
			
		||||
 | 
			
		||||
//1l 1h -> 1h 1l
 | 
			
		||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
 | 
			
		||||
//0h,1l -> 1l,0h
 | 
			
		||||
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
 | 
			
		||||
//Pulled fermion through forwards face, GPBC on upper component
 | 
			
		||||
//Need 0= 0l 1h   1= 1l 0h
 | 
			
		||||
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
 | 
			
		||||
//Pulled fermion through backwards face, GPBC on lower component
 | 
			
		||||
//Need 0= 1l 0h   1= 0l 1h
 | 
			
		||||
 | 
			
		||||
//1l 1h -> 1h 1l
 | 
			
		||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
 | 
			
		||||
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
 | 
			
		||||
  permute##PERM(tmp1, ref(1)(S)(C));				\
 | 
			
		||||
  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
 | 
			
		||||
  INTO = tmp2;
 | 
			
		||||
 | 
			
		||||
//0l 0h -> 0h 0l
 | 
			
		||||
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
 | 
			
		||||
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
 | 
			
		||||
  permute##PERM(tmp1, ref(0)(S)(C));				\
 | 
			
		||||
  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
 | 
			
		||||
  INTO = tmp2;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_SETUP(DIR,F)						\
 | 
			
		||||
  g = F;								\
 | 
			
		||||
  direction = st._directions[DIR];				\
 | 
			
		||||
  distance = st._distances[DIR];				\
 | 
			
		||||
  sl = st._grid->_simd_layout[direction];			\
 | 
			
		||||
  inplace_twist = 0;						\
 | 
			
		||||
  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
 | 
			
		||||
    if(sl == 1){							\
 | 
			
		||||
      g = (F+1) % 2;							\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      inplace_twist = 1;						\
 | 
			
		||||
    }									\
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
 | 
			
		||||
  { const SiteSpinor &ref(in._odata[offset]);				\
 | 
			
		||||
    LOAD_CHI_SETUP(DIR,F);						\
 | 
			
		||||
    if(!inplace_twist){							\
 | 
			
		||||
      LOAD_CHIMU_BODY(g);						\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 | 
			
		||||
	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
      }else{								\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
      } \
 | 
			
		||||
    } \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
 | 
			
		||||
  { const SiteHalfSpinor &ref(buf[offset]);				\
 | 
			
		||||
    LOAD_CHI_SETUP(DIR,F);						\
 | 
			
		||||
    if(!inplace_twist){							\
 | 
			
		||||
      LOAD_CHI_BODY(g);							\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 | 
			
		||||
	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
      }else{								\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
      }									\
 | 
			
		||||
    }									\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 | 
			
		||||
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 | 
			
		||||
 | 
			
		||||
// To splat or not to splat depends on the implementation
 | 
			
		||||
#define MULT_2SPIN_BODY \
 | 
			
		||||
  Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
  Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
  Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
  UChi_00 = U_00*Chi_00;			\
 | 
			
		||||
  UChi_10 = U_00*Chi_10;			\
 | 
			
		||||
  UChi_01 = U_10*Chi_00;			\
 | 
			
		||||
  UChi_11 = U_10*Chi_10;			\
 | 
			
		||||
  UChi_02 = U_20*Chi_00;			\
 | 
			
		||||
  UChi_12 = U_20*Chi_10;			\
 | 
			
		||||
  UChi_00+= U_01*Chi_01;			\
 | 
			
		||||
  UChi_10+= U_01*Chi_11;			\
 | 
			
		||||
  UChi_01+= U_11*Chi_01;			\
 | 
			
		||||
  UChi_11+= U_11*Chi_11;			\
 | 
			
		||||
  UChi_02+= U_21*Chi_01;			\
 | 
			
		||||
  UChi_12+= U_21*Chi_11;			\
 | 
			
		||||
  Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
  Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
  Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
  UChi_00+= U_00*Chi_02;			\
 | 
			
		||||
  UChi_10+= U_00*Chi_12;			\
 | 
			
		||||
  UChi_01+= U_10*Chi_02;			\
 | 
			
		||||
  UChi_11+= U_10*Chi_12;			\
 | 
			
		||||
  UChi_02+= U_20*Chi_02;			\
 | 
			
		||||
  UChi_12+= U_20*Chi_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN(A,F)					\
 | 
			
		||||
  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_GPARITY(A,F)				\
 | 
			
		||||
  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR(dir)			\
 | 
			
		||||
      permute##dir(Chi_00,Chi_00);\
 | 
			
		||||
      permute##dir(Chi_01,Chi_01);\
 | 
			
		||||
      permute##dir(Chi_02,Chi_02);\
 | 
			
		||||
      permute##dir(Chi_10,Chi_10);\
 | 
			
		||||
      permute##dir(Chi_11,Chi_11);\
 | 
			
		||||
      permute##dir(Chi_12,Chi_12);
 | 
			
		||||
 | 
			
		||||
//      hspin(0)=fspin(0)+timesI(fspin(3));
 | 
			
		||||
//      hspin(1)=fspin(1)+timesI(fspin(2));
 | 
			
		||||
#define XP_PROJ \
 | 
			
		||||
    Chi_00 = Chimu_00+timesI(Chimu_30);\
 | 
			
		||||
    Chi_01 = Chimu_01+timesI(Chimu_31);\
 | 
			
		||||
    Chi_02 = Chimu_02+timesI(Chimu_32);\
 | 
			
		||||
    Chi_10 = Chimu_10+timesI(Chimu_20);\
 | 
			
		||||
    Chi_11 = Chimu_11+timesI(Chimu_21);\
 | 
			
		||||
    Chi_12 = Chimu_12+timesI(Chimu_22);
 | 
			
		||||
 | 
			
		||||
#define YP_PROJ \
 | 
			
		||||
    Chi_00 = Chimu_00-Chimu_30;\
 | 
			
		||||
    Chi_01 = Chimu_01-Chimu_31;\
 | 
			
		||||
    Chi_02 = Chimu_02-Chimu_32;\
 | 
			
		||||
    Chi_10 = Chimu_10+Chimu_20;\
 | 
			
		||||
    Chi_11 = Chimu_11+Chimu_21;\
 | 
			
		||||
    Chi_12 = Chimu_12+Chimu_22;
 | 
			
		||||
 | 
			
		||||
#define ZP_PROJ \
 | 
			
		||||
  Chi_00 = Chimu_00+timesI(Chimu_20);		\
 | 
			
		||||
  Chi_01 = Chimu_01+timesI(Chimu_21);		\
 | 
			
		||||
  Chi_02 = Chimu_02+timesI(Chimu_22);		\
 | 
			
		||||
  Chi_10 = Chimu_10-timesI(Chimu_30);		\
 | 
			
		||||
  Chi_11 = Chimu_11-timesI(Chimu_31);		\
 | 
			
		||||
  Chi_12 = Chimu_12-timesI(Chimu_32);
 | 
			
		||||
 | 
			
		||||
#define TP_PROJ \
 | 
			
		||||
  Chi_00 = Chimu_00+Chimu_20;		\
 | 
			
		||||
  Chi_01 = Chimu_01+Chimu_21;		\
 | 
			
		||||
  Chi_02 = Chimu_02+Chimu_22;		\
 | 
			
		||||
  Chi_10 = Chimu_10+Chimu_30;		\
 | 
			
		||||
  Chi_11 = Chimu_11+Chimu_31;		\
 | 
			
		||||
  Chi_12 = Chimu_12+Chimu_32;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//      hspin(0)=fspin(0)-timesI(fspin(3));
 | 
			
		||||
//      hspin(1)=fspin(1)-timesI(fspin(2));
 | 
			
		||||
#define XM_PROJ \
 | 
			
		||||
    Chi_00 = Chimu_00-timesI(Chimu_30);\
 | 
			
		||||
    Chi_01 = Chimu_01-timesI(Chimu_31);\
 | 
			
		||||
    Chi_02 = Chimu_02-timesI(Chimu_32);\
 | 
			
		||||
    Chi_10 = Chimu_10-timesI(Chimu_20);\
 | 
			
		||||
    Chi_11 = Chimu_11-timesI(Chimu_21);\
 | 
			
		||||
    Chi_12 = Chimu_12-timesI(Chimu_22);
 | 
			
		||||
 | 
			
		||||
#define YM_PROJ \
 | 
			
		||||
    Chi_00 = Chimu_00+Chimu_30;\
 | 
			
		||||
    Chi_01 = Chimu_01+Chimu_31;\
 | 
			
		||||
    Chi_02 = Chimu_02+Chimu_32;\
 | 
			
		||||
    Chi_10 = Chimu_10-Chimu_20;\
 | 
			
		||||
    Chi_11 = Chimu_11-Chimu_21;\
 | 
			
		||||
    Chi_12 = Chimu_12-Chimu_22;
 | 
			
		||||
 | 
			
		||||
#define ZM_PROJ \
 | 
			
		||||
  Chi_00 = Chimu_00-timesI(Chimu_20);		\
 | 
			
		||||
  Chi_01 = Chimu_01-timesI(Chimu_21);		\
 | 
			
		||||
  Chi_02 = Chimu_02-timesI(Chimu_22);		\
 | 
			
		||||
  Chi_10 = Chimu_10+timesI(Chimu_30);		\
 | 
			
		||||
  Chi_11 = Chimu_11+timesI(Chimu_31);		\
 | 
			
		||||
  Chi_12 = Chimu_12+timesI(Chimu_32);
 | 
			
		||||
 | 
			
		||||
#define TM_PROJ \
 | 
			
		||||
  Chi_00 = Chimu_00-Chimu_20;		\
 | 
			
		||||
  Chi_01 = Chimu_01-Chimu_21;		\
 | 
			
		||||
  Chi_02 = Chimu_02-Chimu_22;		\
 | 
			
		||||
  Chi_10 = Chimu_10-Chimu_30;		\
 | 
			
		||||
  Chi_11 = Chimu_11-Chimu_31;		\
 | 
			
		||||
  Chi_12 = Chimu_12-Chimu_32;
 | 
			
		||||
 | 
			
		||||
//      fspin(0)=hspin(0);
 | 
			
		||||
//      fspin(1)=hspin(1);
 | 
			
		||||
//      fspin(2)=timesMinusI(hspin(1));
 | 
			
		||||
//      fspin(3)=timesMinusI(hspin(0));
 | 
			
		||||
#define XP_RECON\
 | 
			
		||||
  result_00 = UChi_00;\
 | 
			
		||||
  result_01 = UChi_01;\
 | 
			
		||||
  result_02 = UChi_02;\
 | 
			
		||||
  result_10 = UChi_10;\
 | 
			
		||||
  result_11 = UChi_11;\
 | 
			
		||||
  result_12 = UChi_12;\
 | 
			
		||||
  result_20 = timesMinusI(UChi_10);\
 | 
			
		||||
  result_21 = timesMinusI(UChi_11);\
 | 
			
		||||
  result_22 = timesMinusI(UChi_12);\
 | 
			
		||||
  result_30 = timesMinusI(UChi_00);\
 | 
			
		||||
  result_31 = timesMinusI(UChi_01);\
 | 
			
		||||
  result_32 = timesMinusI(UChi_02);
 | 
			
		||||
 | 
			
		||||
#define XP_RECON_ACCUM\
 | 
			
		||||
  result_00+=UChi_00;\
 | 
			
		||||
  result_01+=UChi_01;\
 | 
			
		||||
  result_02+=UChi_02;\
 | 
			
		||||
  result_10+=UChi_10;\
 | 
			
		||||
  result_11+=UChi_11;\
 | 
			
		||||
  result_12+=UChi_12;\
 | 
			
		||||
  result_20-=timesI(UChi_10);\
 | 
			
		||||
  result_21-=timesI(UChi_11);\
 | 
			
		||||
  result_22-=timesI(UChi_12);\
 | 
			
		||||
  result_30-=timesI(UChi_00);\
 | 
			
		||||
  result_31-=timesI(UChi_01);\
 | 
			
		||||
  result_32-=timesI(UChi_02);
 | 
			
		||||
 | 
			
		||||
#define XM_RECON\
 | 
			
		||||
  result_00 = UChi_00;\
 | 
			
		||||
  result_01 = UChi_01;\
 | 
			
		||||
  result_02 = UChi_02;\
 | 
			
		||||
  result_10 = UChi_10;\
 | 
			
		||||
  result_11 = UChi_11;\
 | 
			
		||||
  result_12 = UChi_12;\
 | 
			
		||||
  result_20 = timesI(UChi_10);\
 | 
			
		||||
  result_21 = timesI(UChi_11);\
 | 
			
		||||
  result_22 = timesI(UChi_12);\
 | 
			
		||||
  result_30 = timesI(UChi_00);\
 | 
			
		||||
  result_31 = timesI(UChi_01);\
 | 
			
		||||
  result_32 = timesI(UChi_02);
 | 
			
		||||
 | 
			
		||||
#define XM_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20+= timesI(UChi_10);\
 | 
			
		||||
  result_21+= timesI(UChi_11);\
 | 
			
		||||
  result_22+= timesI(UChi_12);\
 | 
			
		||||
  result_30+= timesI(UChi_00);\
 | 
			
		||||
  result_31+= timesI(UChi_01);\
 | 
			
		||||
  result_32+= timesI(UChi_02);
 | 
			
		||||
 | 
			
		||||
#define YP_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20+= UChi_10;\
 | 
			
		||||
  result_21+= UChi_11;\
 | 
			
		||||
  result_22+= UChi_12;\
 | 
			
		||||
  result_30-= UChi_00;\
 | 
			
		||||
  result_31-= UChi_01;\
 | 
			
		||||
  result_32-= UChi_02;
 | 
			
		||||
 | 
			
		||||
#define YM_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20-= UChi_10;\
 | 
			
		||||
  result_21-= UChi_11;\
 | 
			
		||||
  result_22-= UChi_12;\
 | 
			
		||||
  result_30+= UChi_00;\
 | 
			
		||||
  result_31+= UChi_01;\
 | 
			
		||||
  result_32+= UChi_02;
 | 
			
		||||
 | 
			
		||||
#define ZP_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20-= timesI(UChi_00);			\
 | 
			
		||||
  result_21-= timesI(UChi_01);			\
 | 
			
		||||
  result_22-= timesI(UChi_02);			\
 | 
			
		||||
  result_30+= timesI(UChi_10);			\
 | 
			
		||||
  result_31+= timesI(UChi_11);			\
 | 
			
		||||
  result_32+= timesI(UChi_12);
 | 
			
		||||
 | 
			
		||||
#define ZM_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20+= timesI(UChi_00);			\
 | 
			
		||||
  result_21+= timesI(UChi_01);			\
 | 
			
		||||
  result_22+= timesI(UChi_02);			\
 | 
			
		||||
  result_30-= timesI(UChi_10);			\
 | 
			
		||||
  result_31-= timesI(UChi_11);			\
 | 
			
		||||
  result_32-= timesI(UChi_12);
 | 
			
		||||
 | 
			
		||||
#define TP_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20+= UChi_00;			\
 | 
			
		||||
  result_21+= UChi_01;			\
 | 
			
		||||
  result_22+= UChi_02;			\
 | 
			
		||||
  result_30+= UChi_10;			\
 | 
			
		||||
  result_31+= UChi_11;			\
 | 
			
		||||
  result_32+= UChi_12;
 | 
			
		||||
 | 
			
		||||
#define TM_RECON_ACCUM\
 | 
			
		||||
  result_00+= UChi_00;\
 | 
			
		||||
  result_01+= UChi_01;\
 | 
			
		||||
  result_02+= UChi_02;\
 | 
			
		||||
  result_10+= UChi_10;\
 | 
			
		||||
  result_11+= UChi_11;\
 | 
			
		||||
  result_12+= UChi_12;\
 | 
			
		||||
  result_20-= UChi_00;	\
 | 
			
		||||
  result_21-= UChi_01;	\
 | 
			
		||||
  result_22-= UChi_02;	\
 | 
			
		||||
  result_30-= UChi_10;	\
 | 
			
		||||
  result_31-= UChi_11;	\
 | 
			
		||||
  result_32-= UChi_12;
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if ( local ) {				\
 | 
			
		||||
    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    PROJ;					\
 | 
			
		||||
    if ( perm) {				\
 | 
			
		||||
      PERMUTE_DIR(PERM);			\
 | 
			
		||||
    }						\
 | 
			
		||||
  } else {					\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
  }						\
 | 
			
		||||
  MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
  RECON;					
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if ( local ) {				\
 | 
			
		||||
    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    PROJ;					\
 | 
			
		||||
    if ( perm) {				\
 | 
			
		||||
      PERMUTE_DIR(PERM);			\
 | 
			
		||||
    }						\
 | 
			
		||||
  } else if ( st.same_node[DIR] ) {		\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
  }						\
 | 
			
		||||
  if (local || st.same_node[DIR] ) {		\
 | 
			
		||||
    MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
    RECON;					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
    RECON;					\
 | 
			
		||||
    nmu++;					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_RESULT(ss,F)			\
 | 
			
		||||
  {						\
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);		\
 | 
			
		||||
    vstream(ref(F)(0)(0),result_00);		\
 | 
			
		||||
    vstream(ref(F)(0)(1),result_01);		\
 | 
			
		||||
    vstream(ref(F)(0)(2),result_02);		\
 | 
			
		||||
    vstream(ref(F)(1)(0),result_10);		\
 | 
			
		||||
    vstream(ref(F)(1)(1),result_11);		\
 | 
			
		||||
    vstream(ref(F)(1)(2),result_12);		\
 | 
			
		||||
    vstream(ref(F)(2)(0),result_20);		\
 | 
			
		||||
    vstream(ref(F)(2)(1),result_21);		\
 | 
			
		||||
    vstream(ref(F)(2)(2),result_22);		\
 | 
			
		||||
    vstream(ref(F)(3)(0),result_30);		\
 | 
			
		||||
    vstream(ref(F)(3)(1),result_31);		\
 | 
			
		||||
    vstream(ref(F)(3)(2),result_32);		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_RESULT_EXT(ss,F)			\
 | 
			
		||||
  if (nmu){					\
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);		\
 | 
			
		||||
    ref(F)(0)(0)+=result_00;		\
 | 
			
		||||
    ref(F)(0)(1)+=result_01;		\
 | 
			
		||||
    ref(F)(0)(2)+=result_02;		\
 | 
			
		||||
    ref(F)(1)(0)+=result_10;		\
 | 
			
		||||
    ref(F)(1)(1)+=result_11;		\
 | 
			
		||||
    ref(F)(1)(2)+=result_12;		\
 | 
			
		||||
    ref(F)(2)(0)+=result_20;		\
 | 
			
		||||
    ref(F)(2)(1)+=result_21;		\
 | 
			
		||||
    ref(F)(2)(2)+=result_22;		\
 | 
			
		||||
    ref(F)(3)(0)+=result_30;		\
 | 
			
		||||
    ref(F)(3)(1)+=result_31;		\
 | 
			
		||||
    ref(F)(3)(2)+=result_32;		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define HAND_DECLARATIONS(a)			\
 | 
			
		||||
  Simd result_00;				\
 | 
			
		||||
  Simd result_01;				\
 | 
			
		||||
  Simd result_02;				\
 | 
			
		||||
  Simd result_10;				\
 | 
			
		||||
  Simd result_11;				\
 | 
			
		||||
  Simd result_12;				\
 | 
			
		||||
  Simd result_20;				\
 | 
			
		||||
  Simd result_21;				\
 | 
			
		||||
  Simd result_22;				\
 | 
			
		||||
  Simd result_30;				\
 | 
			
		||||
  Simd result_31;				\
 | 
			
		||||
  Simd result_32;				\
 | 
			
		||||
  Simd Chi_00;					\
 | 
			
		||||
  Simd Chi_01;					\
 | 
			
		||||
  Simd Chi_02;					\
 | 
			
		||||
  Simd Chi_10;					\
 | 
			
		||||
  Simd Chi_11;					\
 | 
			
		||||
  Simd Chi_12;					\
 | 
			
		||||
  Simd UChi_00;					\
 | 
			
		||||
  Simd UChi_01;					\
 | 
			
		||||
  Simd UChi_02;					\
 | 
			
		||||
  Simd UChi_10;					\
 | 
			
		||||
  Simd UChi_11;					\
 | 
			
		||||
  Simd UChi_12;					\
 | 
			
		||||
  Simd U_00;					\
 | 
			
		||||
  Simd U_10;					\
 | 
			
		||||
  Simd U_20;					\
 | 
			
		||||
  Simd U_01;					\
 | 
			
		||||
  Simd U_11;					\
 | 
			
		||||
  Simd U_21;
 | 
			
		||||
 | 
			
		||||
#define ZERO_RESULT				\
 | 
			
		||||
  result_00=zero;				\
 | 
			
		||||
  result_01=zero;				\
 | 
			
		||||
  result_02=zero;				\
 | 
			
		||||
  result_10=zero;				\
 | 
			
		||||
  result_11=zero;				\
 | 
			
		||||
  result_12=zero;				\
 | 
			
		||||
  result_20=zero;				\
 | 
			
		||||
  result_21=zero;				\
 | 
			
		||||
  result_22=zero;				\
 | 
			
		||||
  result_30=zero;				\
 | 
			
		||||
  result_31=zero;				\
 | 
			
		||||
  result_32=zero;			
 | 
			
		||||
 | 
			
		||||
#define Chimu_00 Chi_00
 | 
			
		||||
#define Chimu_01 Chi_01
 | 
			
		||||
#define Chimu_02 Chi_02
 | 
			
		||||
#define Chimu_10 Chi_10
 | 
			
		||||
#define Chimu_11 Chi_11
 | 
			
		||||
#define Chimu_12 Chi_12
 | 
			
		||||
#define Chimu_20 UChi_00
 | 
			
		||||
#define Chimu_21 UChi_01
 | 
			
		||||
#define Chimu_22 UChi_02
 | 
			
		||||
#define Chimu_30 UChi_10
 | 
			
		||||
#define Chimu_31 UChi_11
 | 
			
		||||
#define Chimu_32 UChi_12
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 | 
			
		||||
					  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
						  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 | 
			
		||||
					  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
						  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
 | 
			
		||||
  ZERO_RESULT;							\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
  
 | 
			
		||||
  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
 | 
			
		||||
					  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT_EXT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 | 
			
		||||
						  int ss,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename Simd::scalar_type S;
 | 
			
		||||
  typedef typename Simd::vector_type V;
 | 
			
		||||
 | 
			
		||||
  HAND_DECLARATIONS(ignore);
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT_EXT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define HAND_SPECIALISE_GPARITY(IMPL)					\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
				    int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
					    int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
 | 
			
		||||
    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
							     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int nmu=0;								\
 | 
			
		||||
    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    nmu = 0;								\
 | 
			
		||||
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
							     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    int nmu=0;								\
 | 
			
		||||
    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    nmu = 0;								\
 | 
			
		||||
    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
////////////// Wilson ; uses this implementation /////////////////////
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_THEM(A) \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 | 
			
		||||
					     int ss,int sU,const FermionField &in, FermionField &out); \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out);\
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out); \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
						   int ss,int sU,const FermionField &in, FermionField &out); \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
 | 
			
		||||
						int ss,int sU,const FermionField &in, FermionField &out); \
 | 
			
		||||
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
						   int ss,int sU,const FermionField &in, FermionField &out); 
 | 
			
		||||
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplF);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplD);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplFH);
 | 
			
		||||
INSTANTIATE_THEM(GparityWilsonImplDF);
 | 
			
		||||
}}
 | 
			
		||||
@@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 | 
			
		||||
 | 
			
		||||
    RealD factor = 0.5 * beta / RealD(Nc);
 | 
			
		||||
 | 
			
		||||
    //GaugeLinkField Umu(U._grid);
 | 
			
		||||
    GaugeLinkField Umu(U._grid);
 | 
			
		||||
    GaugeLinkField dSdU_mu(U._grid);
 | 
			
		||||
    for (int mu = 0; mu < Nd; mu++) {
 | 
			
		||||
      //Umu = PeekIndex<LorentzIndex>(U, mu);
 | 
			
		||||
      Umu = PeekIndex<LorentzIndex>(U, mu);
 | 
			
		||||
 | 
			
		||||
      // Staple in direction mu
 | 
			
		||||
      //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
 | 
			
		||||
      //dSdU_mu = Ta(Umu * dSdU_mu) * factor;
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
      WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
 | 
			
		||||
      dSdU_mu = Ta(dSdU_mu) * factor;
 | 
			
		||||
      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
 | 
			
		||||
      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
 | 
			
		||||
 | 
			
		||||
      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -16,12 +16,12 @@ class ScalarImplTypes {
 | 
			
		||||
    typedef iImplField<Simd> SiteField;
 | 
			
		||||
    typedef SiteField        SitePropagator;
 | 
			
		||||
    typedef SiteField        SiteComplex;
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<SiteField> Field;
 | 
			
		||||
    typedef Field              ComplexField;
 | 
			
		||||
    typedef Field              FermionField;
 | 
			
		||||
    typedef Field              PropagatorField;
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
 | 
			
		||||
      gaussian(pRNG, P);
 | 
			
		||||
    }
 | 
			
		||||
@@ -47,54 +47,60 @@ class ScalarImplTypes {
 | 
			
		||||
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
 | 
			
		||||
      U = 1.0;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    static void MomentumSpacePropagator(Field &out, RealD m)
 | 
			
		||||
    {
 | 
			
		||||
      GridBase           *grid = out._grid;
 | 
			
		||||
      Field              kmu(grid), one(grid);
 | 
			
		||||
      const unsigned int nd    = grid->_ndimension;
 | 
			
		||||
      std::vector<int>   &l    = grid->_fdimensions;
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      one = Complex(1.0,0.0);
 | 
			
		||||
      out = m*m;
 | 
			
		||||
      for(int mu = 0; mu < nd; mu++)
 | 
			
		||||
      {
 | 
			
		||||
        Real twoPiL = M_PI*2./l[mu];
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
        LatticeCoordinate(kmu,mu);
 | 
			
		||||
        kmu = 2.*sin(.5*twoPiL*kmu);
 | 
			
		||||
        out = out + kmu*kmu;
 | 
			
		||||
      }
 | 
			
		||||
      out = one/out;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    static void FreePropagator(const Field &in, Field &out,
 | 
			
		||||
                               const Field &momKernel)
 | 
			
		||||
    {
 | 
			
		||||
      FFT   fft((GridCartesian *)in._grid);
 | 
			
		||||
      Field inFT(in._grid);
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      fft.FFT_all_dim(inFT, in, FFT::forward);
 | 
			
		||||
      inFT = inFT*momKernel;
 | 
			
		||||
      fft.FFT_all_dim(out, inFT, FFT::backward);
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    static void FreePropagator(const Field &in, Field &out, RealD m)
 | 
			
		||||
    {
 | 
			
		||||
      Field momKernel(in._grid);
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      MomentumSpacePropagator(momKernel, m);
 | 
			
		||||
      FreePropagator(in, out, momKernel);
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  #ifdef  USE_FFT_ACCELERATION
 | 
			
		||||
  #ifndef FFT_MASS
 | 
			
		||||
  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
 | 
			
		||||
  #endif
 | 
			
		||||
  #endif
 | 
			
		||||
  
 | 
			
		||||
  template <class S, unsigned int N>
 | 
			
		||||
  class ScalarAdjMatrixImplTypes {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef S Simd;
 | 
			
		||||
    typedef QCD::SU<N> Group;
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    template <typename vtype>
 | 
			
		||||
    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
 | 
			
		||||
    template <typename vtype>
 | 
			
		||||
@@ -103,24 +109,119 @@ class ScalarImplTypes {
 | 
			
		||||
    typedef iImplField<Simd>   SiteField;
 | 
			
		||||
    typedef SiteField          SitePropagator;
 | 
			
		||||
    typedef iImplComplex<Simd> SiteComplex;
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    typedef Lattice<SiteField>   Field;
 | 
			
		||||
    typedef Lattice<SiteComplex> ComplexField;
 | 
			
		||||
    typedef Field                FermionField;
 | 
			
		||||
    typedef Field                PropagatorField;
 | 
			
		||||
 | 
			
		||||
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
 | 
			
		||||
    static void MomentaSquare(ComplexField &out)
 | 
			
		||||
    {
 | 
			
		||||
      GridBase *grid = out._grid;
 | 
			
		||||
      const std::vector<int> &l = grid->FullDimensions();
 | 
			
		||||
      ComplexField kmu(grid);
 | 
			
		||||
 | 
			
		||||
      for (int mu = 0; mu < grid->Nd(); mu++)
 | 
			
		||||
      {
 | 
			
		||||
        Real twoPiL = M_PI * 2.0 / l[mu];
 | 
			
		||||
        LatticeCoordinate(kmu, mu);
 | 
			
		||||
        kmu = 2.0 * sin(0.5 * twoPiL * kmu);
 | 
			
		||||
        out += kmu * kmu;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static void MomentumSpacePropagator(ComplexField &out, RealD m)
 | 
			
		||||
    {
 | 
			
		||||
      GridBase *grid = out._grid;
 | 
			
		||||
      ComplexField one(grid);
 | 
			
		||||
      one = Complex(1.0, 0.0);
 | 
			
		||||
      out = m * m;
 | 
			
		||||
      MomentaSquare(out);
 | 
			
		||||
      out = one / out;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
 | 
			
		||||
    {
 | 
			
		||||
#ifndef USE_FFT_ACCELERATION
 | 
			
		||||
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
      Field Pgaussian(P._grid), Pp(P._grid);
 | 
			
		||||
      ComplexField p2(P._grid); p2 = zero;
 | 
			
		||||
      RealD M = FFT_MASS;
 | 
			
		||||
      
 | 
			
		||||
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
 | 
			
		||||
 | 
			
		||||
      FFT theFFT((GridCartesian*)P._grid);
 | 
			
		||||
      theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
 | 
			
		||||
      MomentaSquare(p2);
 | 
			
		||||
      p2 += M * M;
 | 
			
		||||
      p2 = sqrt(p2);
 | 
			
		||||
      Pp *= p2;
 | 
			
		||||
      theFFT.FFT_all_dim(P, Pp, FFT::backward);
 | 
			
		||||
 | 
			
		||||
#endif //USE_FFT_ACCELERATION
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline Field projectForce(Field& P) {return P;}
 | 
			
		||||
 | 
			
		||||
    static inline void update_field(Field& P, Field& U, double ep) {
 | 
			
		||||
      U += P*ep;
 | 
			
		||||
    static inline void update_field(Field &P, Field &U, double ep)
 | 
			
		||||
    {
 | 
			
		||||
#ifndef USE_FFT_ACCELERATION
 | 
			
		||||
      double t0=usecond(); 
 | 
			
		||||
      U += P * ep;
 | 
			
		||||
      double t1=usecond();
 | 
			
		||||
      double total_time = (t1-t0)/1e6;
 | 
			
		||||
      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; 
 | 
			
		||||
#else
 | 
			
		||||
      // FFT transform P(x) -> P(p)
 | 
			
		||||
      // divide by (M^2+p^2)  M external parameter (how to pass?)
 | 
			
		||||
      // P'(p) = P(p)/(M^2+p^2)
 | 
			
		||||
      // Transform back -> P'(x)
 | 
			
		||||
      // U += P'(x)*ep
 | 
			
		||||
 | 
			
		||||
      Field Pp(U._grid), P_FFT(U._grid);     
 | 
			
		||||
      static ComplexField p2(U._grid);
 | 
			
		||||
      RealD M = FFT_MASS;
 | 
			
		||||
      
 | 
			
		||||
      FFT theFFT((GridCartesian*)U._grid);
 | 
			
		||||
      theFFT.FFT_all_dim(Pp, P, FFT::forward);
 | 
			
		||||
 | 
			
		||||
      static bool first_call = true;
 | 
			
		||||
      if (first_call)
 | 
			
		||||
      {
 | 
			
		||||
        // avoid recomputing
 | 
			
		||||
        MomentumSpacePropagator(p2, M);
 | 
			
		||||
        first_call = false;
 | 
			
		||||
      }
 | 
			
		||||
      Pp *= p2;
 | 
			
		||||
      theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
 | 
			
		||||
      U += P_FFT * ep;
 | 
			
		||||
 | 
			
		||||
#endif //USE_FFT_ACCELERATION
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline RealD FieldSquareNorm(Field& U) {
 | 
			
		||||
      return (TensorRemove(sum(trace(U*U))).real());
 | 
			
		||||
    static inline RealD FieldSquareNorm(Field &U)
 | 
			
		||||
    {
 | 
			
		||||
#ifndef USE_FFT_ACCELERATION
 | 
			
		||||
      return (TensorRemove(sum(trace(U * U))).real());
 | 
			
		||||
#else
 | 
			
		||||
      // In case of Fourier acceleration we have to:
 | 
			
		||||
      // compute U(p)*U(p)/(M^2+p^2))   Parseval theorem
 | 
			
		||||
      // 1 FFT needed U(x) -> U(p)
 | 
			
		||||
      // M to be passed
 | 
			
		||||
 | 
			
		||||
      FFT theFFT((GridCartesian*)U._grid);
 | 
			
		||||
      Field Up(U._grid);
 | 
			
		||||
 | 
			
		||||
      theFFT.FFT_all_dim(Up, U, FFT::forward);
 | 
			
		||||
      RealD M = FFT_MASS;
 | 
			
		||||
      ComplexField p2(U._grid);
 | 
			
		||||
      MomentumSpacePropagator(p2, M);
 | 
			
		||||
      Field Up2 = Up * p2;
 | 
			
		||||
      // from the definition of the DFT we need to divide by the volume
 | 
			
		||||
      return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
 | 
			
		||||
#endif //USE_FFT_ACCELERATION
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
 | 
			
		||||
@@ -146,7 +247,7 @@ class ScalarImplTypes {
 | 
			
		||||
  typedef ScalarImplTypes<vComplex> ScalarImplCR;
 | 
			
		||||
  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
 | 
			
		||||
  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
  // Hardcoding here the size of the matrices
 | 
			
		||||
  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
 | 
			
		||||
  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
 | 
			
		||||
@@ -155,7 +256,7 @@ class ScalarImplTypes {
 | 
			
		||||
  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
 | 
			
		||||
  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
 | 
			
		||||
  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  //}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -30,119 +30,179 @@ directory
 | 
			
		||||
#ifndef SCALAR_INT_ACTION_H
 | 
			
		||||
#define SCALAR_INT_ACTION_H
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Note: this action can completely absorb the ScalarAction for real float fields
 | 
			
		||||
// use the scalarObjs to generalise the structure
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
  // FIXME drop the QCD namespace everywhere here
 | 
			
		||||
namespace Grid
 | 
			
		||||
{
 | 
			
		||||
// FIXME drop the QCD namespace everywhere here
 | 
			
		||||
 | 
			
		||||
  template <class Impl, int Ndim >
 | 
			
		||||
  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
 | 
			
		||||
  public:
 | 
			
		||||
    INHERIT_FIELD_TYPES(Impl);
 | 
			
		||||
  private:
 | 
			
		||||
    RealD mass_square;
 | 
			
		||||
    RealD lambda;
 | 
			
		||||
template <class Impl, int Ndim>
 | 
			
		||||
class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
  INHERIT_FIELD_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
  RealD mass_square;
 | 
			
		||||
  RealD lambda;
 | 
			
		||||
  RealD g;
 | 
			
		||||
  const unsigned int N = Impl::Group::Dimension;
 | 
			
		||||
 | 
			
		||||
    typedef typename Field::vector_object vobj;
 | 
			
		||||
    typedef CartesianStencil<vobj,vobj> Stencil;
 | 
			
		||||
  typedef typename Field::vector_object vobj;
 | 
			
		||||
  typedef CartesianStencil<vobj, vobj> Stencil;
 | 
			
		||||
 | 
			
		||||
    SimpleCompressor<vobj> compressor;
 | 
			
		||||
    int npoint = 2*Ndim;
 | 
			
		||||
    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
 | 
			
		||||
    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
 | 
			
		||||
  SimpleCompressor<vobj> compressor;
 | 
			
		||||
  int npoint = 2 * Ndim;
 | 
			
		||||
  std::vector<int> directions;    //
 | 
			
		||||
  std::vector<int> displacements; //
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
 | 
			
		||||
      for (int mu = 0 ; mu < Ndim; mu++){
 | 
			
		||||
		directions[mu]         = mu; directions[mu+Ndim]    = mu;
 | 
			
		||||
		displacements[mu]      =  1; displacements[mu+Ndim] = -1;
 | 
			
		||||
      }
 | 
			
		||||
public:
 | 
			
		||||
  ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), displacements(2 * Ndim, 0), directions(2 * Ndim, 0)
 | 
			
		||||
  {
 | 
			
		||||
    for (int mu = 0; mu < Ndim; mu++)
 | 
			
		||||
    {
 | 
			
		||||
      directions[mu] = mu;
 | 
			
		||||
      directions[mu + Ndim] = mu;
 | 
			
		||||
      displacements[mu] = 1;
 | 
			
		||||
      displacements[mu + Ndim] = -1;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
    virtual std::string LogParameters() {
 | 
			
		||||
      std::stringstream sstream;
 | 
			
		||||
      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
 | 
			
		||||
      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
 | 
			
		||||
      return sstream.str();
 | 
			
		||||
    }
 | 
			
		||||
  virtual std::string LogParameters()
 | 
			
		||||
  {
 | 
			
		||||
    std::stringstream sstream;
 | 
			
		||||
    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
 | 
			
		||||
    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
 | 
			
		||||
    sstream << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
 | 
			
		||||
    return sstream.str();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
    virtual std::string action_name() {return "ScalarAction";}
 | 
			
		||||
  virtual std::string action_name() { return "ScalarAction"; }
 | 
			
		||||
 | 
			
		||||
    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 | 
			
		||||
  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 | 
			
		||||
 | 
			
		||||
    virtual RealD S(const Field &p) {
 | 
			
		||||
      assert(p._grid->Nd() == Ndim);
 | 
			
		||||
      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
 | 
			
		||||
      phiStencil.HaloExchange(p, compressor);
 | 
			
		||||
      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
 | 
			
		||||
      phisquared = p*p;
 | 
			
		||||
      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
 | 
			
		||||
      for (int mu = 0; mu < Ndim; mu++) {
 | 
			
		||||
	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 | 
			
		||||
	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
 | 
			
		||||
	  int permute_type;
 | 
			
		||||
	  StencilEntry *SE;
 | 
			
		||||
	  vobj temp2;
 | 
			
		||||
	  const vobj *temp, *t_p;
 | 
			
		||||
	    
 | 
			
		||||
	  SE = phiStencil.GetEntry(permute_type, mu, i);
 | 
			
		||||
	  t_p  = &p._odata[i];
 | 
			
		||||
	  if ( SE->_is_local ) {
 | 
			
		||||
	    temp = &p._odata[SE->_offset];
 | 
			
		||||
	    if ( SE->_permute ) {
 | 
			
		||||
	      permute(temp2, *temp, permute_type);
 | 
			
		||||
	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
 | 
			
		||||
	    } else {
 | 
			
		||||
	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
 | 
			
		||||
	    }
 | 
			
		||||
	  } else {
 | 
			
		||||
	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	//  action -= pshift*p + p*pshift;
 | 
			
		||||
      }
 | 
			
		||||
      // NB the trace in the algebra is normalised to 1/2
 | 
			
		||||
      // minus sign coming from the antihermitian fields
 | 
			
		||||
      return -(TensorRemove(sum(trace(action)))).real();
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    virtual void deriv(const Field &p, Field &force) {
 | 
			
		||||
      assert(p._grid->Nd() == Ndim);
 | 
			
		||||
      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
 | 
			
		||||
      // move this outside
 | 
			
		||||
      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
 | 
			
		||||
      phiStencil.HaloExchange(p, compressor);
 | 
			
		||||
      
 | 
			
		||||
      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
 | 
			
		||||
      for (int point = 0; point < npoint; point++) {
 | 
			
		||||
	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
 | 
			
		||||
	  const vobj *temp;
 | 
			
		||||
	  vobj temp2;
 | 
			
		||||
	  int permute_type;
 | 
			
		||||
	  StencilEntry *SE;
 | 
			
		||||
	  SE = phiStencil.GetEntry(permute_type, point, i);
 | 
			
		||||
	  
 | 
			
		||||
	  if ( SE->_is_local ) {
 | 
			
		||||
	    temp = &p._odata[SE->_offset];
 | 
			
		||||
	    if ( SE->_permute ) {
 | 
			
		||||
	      permute(temp2, *temp, permute_type);
 | 
			
		||||
	      force._odata[i] -= temp2;
 | 
			
		||||
	    } else {
 | 
			
		||||
	      force._odata[i] -= *temp;
 | 
			
		||||
	    }
 | 
			
		||||
	  } else {
 | 
			
		||||
	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
  virtual RealD S(const Field &p)
 | 
			
		||||
  {
 | 
			
		||||
    assert(p._grid->Nd() == Ndim);
 | 
			
		||||
    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
 | 
			
		||||
    phiStencil.HaloExchange(p, compressor);
 | 
			
		||||
    Field action(p._grid), pshift(p._grid), phisquared(p._grid);
 | 
			
		||||
    phisquared = p * p;
 | 
			
		||||
    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
 | 
			
		||||
    for (int mu = 0; mu < Ndim; mu++)
 | 
			
		||||
    {
 | 
			
		||||
      //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 | 
			
		||||
      parallel_for(int i = 0; i < p._grid->oSites(); i++)
 | 
			
		||||
      {
 | 
			
		||||
        int permute_type;
 | 
			
		||||
        StencilEntry *SE;
 | 
			
		||||
        vobj temp2;
 | 
			
		||||
        const vobj *temp, *t_p;
 | 
			
		||||
 | 
			
		||||
        SE = phiStencil.GetEntry(permute_type, mu, i);
 | 
			
		||||
        t_p = &p._odata[i];
 | 
			
		||||
        if (SE->_is_local)
 | 
			
		||||
        {
 | 
			
		||||
          temp = &p._odata[SE->_offset];
 | 
			
		||||
          if (SE->_permute)
 | 
			
		||||
          {
 | 
			
		||||
            permute(temp2, *temp, permute_type);
 | 
			
		||||
            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
 | 
			
		||||
          }
 | 
			
		||||
          else
 | 
			
		||||
          {
 | 
			
		||||
            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      //  action -= pshift*p + p*pshift;
 | 
			
		||||
    }
 | 
			
		||||
    // NB the trace in the algebra is normalised to 1/2
 | 
			
		||||
    // minus sign coming from the antihermitian fields
 | 
			
		||||
    return -(TensorRemove(sum(trace(action)))).real() * N / g;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
}  // namespace Grid
 | 
			
		||||
 | 
			
		||||
#endif  // SCALAR_INT_ACTION_H
 | 
			
		||||
  virtual void deriv(const Field &p, Field &force)
 | 
			
		||||
  {
 | 
			
		||||
    double t0 = usecond();
 | 
			
		||||
    assert(p._grid->Nd() == Ndim);
 | 
			
		||||
    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
 | 
			
		||||
    double interm_t = usecond();
 | 
			
		||||
 | 
			
		||||
    // move this outside
 | 
			
		||||
    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
 | 
			
		||||
 | 
			
		||||
    phiStencil.HaloExchange(p, compressor);
 | 
			
		||||
    double halo_t = usecond();
 | 
			
		||||
    int chunk = 128;
 | 
			
		||||
    //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
 | 
			
		||||
 | 
			
		||||
    // inverting the order of the loops slows down the code(! g++ 7)
 | 
			
		||||
    // cannot try to reduce the number of  force writes by factor npoint...
 | 
			
		||||
    // use cache blocking
 | 
			
		||||
    for (int point = 0; point < npoint; point++)
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
{
 | 
			
		||||
        int permute_type;
 | 
			
		||||
        StencilEntry *SE;
 | 
			
		||||
        const vobj *temp;
 | 
			
		||||
 | 
			
		||||
#pragma omp for schedule(static, chunk)
 | 
			
		||||
      for (int i = 0; i < p._grid->oSites(); i++)
 | 
			
		||||
      {
 | 
			
		||||
        SE = phiStencil.GetEntry(permute_type, point, i);
 | 
			
		||||
        // prefetch next p?
 | 
			
		||||
 | 
			
		||||
        if (SE->_is_local)
 | 
			
		||||
        {
 | 
			
		||||
          temp = &p._odata[SE->_offset];
 | 
			
		||||
      
 | 
			
		||||
          if (SE->_permute)
 | 
			
		||||
          {
 | 
			
		||||
            vobj temp2;
 | 
			
		||||
            permute(temp2, *temp, permute_type);
 | 
			
		||||
            force._odata[i] -= temp2;
 | 
			
		||||
          }
 | 
			
		||||
          else
 | 
			
		||||
          {
 | 
			
		||||
            force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  force *= N / g;
 | 
			
		||||
 | 
			
		||||
  double t1 = usecond();
 | 
			
		||||
  double total_time = (t1 - t0) / 1e6;
 | 
			
		||||
  double interm_time = (interm_t - t0) / 1e6;
 | 
			
		||||
  double halo_time = (halo_t - interm_t) / 1e6;
 | 
			
		||||
  double stencil_time = (t1 - halo_t) / 1e6;
 | 
			
		||||
  std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
 | 
			
		||||
  std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
 | 
			
		||||
  std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
 | 
			
		||||
  std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
 | 
			
		||||
  double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
 | 
			
		||||
  double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
 | 
			
		||||
  double Gflops = flops / (total_time * 1e9);
 | 
			
		||||
  double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
 | 
			
		||||
  std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
 | 
			
		||||
  std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
 | 
			
		||||
}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
} // namespace Grid
 | 
			
		||||
 | 
			
		||||
#endif // SCALAR_INT_ACTION_H
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user