mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/clover
This commit is contained in:
		
							
								
								
									
										100
									
								
								lib/qcd/action/fermion/AbstractEOFAFermion.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								lib/qcd/action/fermion/AbstractEOFAFermion.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,100 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef  GRID_QCD_ABSTRACT_EOFA_FERMION_H
 | 
			
		||||
#define  GRID_QCD_ABSTRACT_EOFA_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  // DJM: Abstract base class for EOFA fermion types.
 | 
			
		||||
  // Defines layout of additional EOFA-specific parameters and operators.
 | 
			
		||||
  // Use to construct EOFA pseudofermion actions that are agnostic to
 | 
			
		||||
  // Shamir / Mobius / etc., and ensure that no one can construct EOFA
 | 
			
		||||
  // pseudofermion action with non-EOFA fermion type.
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
 | 
			
		||||
    public:
 | 
			
		||||
      INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
      // Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
 | 
			
		||||
      RealD mq1;
 | 
			
		||||
      RealD mq2;
 | 
			
		||||
      RealD mq3;
 | 
			
		||||
      RealD shift;
 | 
			
		||||
      int pm;
 | 
			
		||||
 | 
			
		||||
      RealD alpha; // Mobius scale
 | 
			
		||||
      RealD k;     // EOFA normalization constant
 | 
			
		||||
 | 
			
		||||
      virtual void Instantiatable(void) = 0;
 | 
			
		||||
 | 
			
		||||
      // EOFA-specific operations
 | 
			
		||||
      // Force user to implement in derived classes
 | 
			
		||||
      virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
 | 
			
		||||
      virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
 | 
			
		||||
      virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;
 | 
			
		||||
 | 
			
		||||
      // Implement derivatives in base class:
 | 
			
		||||
      // for EOFA both DWF and Mobius just need d(Dw)/dU
 | 
			
		||||
      virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
 | 
			
		||||
        this->DhopDeriv(mat, U, V, dag);
 | 
			
		||||
      };
 | 
			
		||||
      virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
 | 
			
		||||
        this->DhopDerivOE(mat, U, V, dag);
 | 
			
		||||
      };
 | 
			
		||||
      virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
 | 
			
		||||
        this->DhopDerivEO(mat, U, V, dag);
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
      // Recompute 5D coefficients for different value of shift constant
 | 
			
		||||
      // (needed for heatbath loop over poles)
 | 
			
		||||
      virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
 | 
			
		||||
        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
 | 
			
		||||
        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
 | 
			
		||||
        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
 | 
			
		||||
        : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
 | 
			
		||||
          _mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
 | 
			
		||||
      {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        this->alpha = _b + _c;
 | 
			
		||||
        this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
 | 
			
		||||
                    ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
 | 
			
		||||
                    ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
 | 
			
		||||
      };
 | 
			
		||||
  };
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
@@ -77,7 +77,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 | 
			
		||||
{
 | 
			
		||||
  this->Report();
 | 
			
		||||
@@ -119,7 +118,6 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 | 
			
		||||
  MooeeInvTime=0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
@@ -414,7 +412,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  for(int i=0; i < Ls; i++){
 | 
			
		||||
    as[i] = 1.0;
 | 
			
		||||
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
 | 
			
		||||
    //    assert(fabs(omega[i])>0.0);
 | 
			
		||||
    assert(omega[i]!=Coeff_t(0.0));
 | 
			
		||||
    bs[i] = 0.5*(bpc/omega[i] + bmc);
 | 
			
		||||
    cs[i] = 0.5*(bpc/omega[i] - bmc);
 | 
			
		||||
  }
 | 
			
		||||
@@ -429,7 +427,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  
 | 
			
		||||
  for(int i=0;i<Ls;i++){
 | 
			
		||||
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
 | 
			
		||||
    //    assert(fabs(bee[i])>0.0);
 | 
			
		||||
    assert(bee[i]!=Coeff_t(0.0));
 | 
			
		||||
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
 | 
			
		||||
    beo[i]=as[i]*bs[i];
 | 
			
		||||
    ceo[i]=-as[i]*cs[i];
 | 
			
		||||
@@ -455,11 +453,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
    dee[i] = bee[i];
 | 
			
		||||
    
 | 
			
		||||
    if ( i < Ls-1 ) {
 | 
			
		||||
 | 
			
		||||
      assert(bee[i]!=Coeff_t(0.0));
 | 
			
		||||
      assert(bee[0]!=Coeff_t(0.0));
 | 
			
		||||
      
 | 
			
		||||
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
 | 
			
		||||
      
 | 
			
		||||
      leem[i]=mass*cee[Ls-1]/bee[0];
 | 
			
		||||
      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
 | 
			
		||||
      for(int j=0;j<i;j++) {
 | 
			
		||||
	assert(bee[j+1]!=Coeff_t(0.0));
 | 
			
		||||
	leem[i]*= aee[j]/bee[j+1];
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
 | 
			
		||||
      
 | 
			
		||||
@@ -478,7 +482,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  { 
 | 
			
		||||
    Coeff_t delta_d=mass*cee[Ls-1];
 | 
			
		||||
    for(int j=0;j<Ls-1;j++) {
 | 
			
		||||
      //      assert(fabs(bee[j])>0.0);
 | 
			
		||||
      assert(bee[j] != Coeff_t(0.0));
 | 
			
		||||
      delta_d *= cee[j]/bee[j];
 | 
			
		||||
    }
 | 
			
		||||
    dee[Ls-1] += delta_d;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.h
 | 
			
		||||
 | 
			
		||||
@@ -35,24 +35,24 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  namespace QCD {
 | 
			
		||||
 | 
			
		||||
     template<typename T> struct switcheroo   {  
 | 
			
		||||
       static inline int iscomplex()  { return 0; } 
 | 
			
		||||
     template<typename T> struct switcheroo   {
 | 
			
		||||
       static inline int iscomplex()  { return 0; }
 | 
			
		||||
 | 
			
		||||
       template<class vec>
 | 
			
		||||
       static inline vec mult(vec a, vec b) {
 | 
			
		||||
	 return real_mult(a,b);
 | 
			
		||||
       }
 | 
			
		||||
     };
 | 
			
		||||
     template<> struct switcheroo<ComplexD> {  
 | 
			
		||||
       static inline int iscomplex()  { return 1; } 
 | 
			
		||||
     template<> struct switcheroo<ComplexD> {
 | 
			
		||||
       static inline int iscomplex()  { return 1; }
 | 
			
		||||
 | 
			
		||||
       template<class vec>
 | 
			
		||||
       static inline vec mult(vec a, vec b) {
 | 
			
		||||
	 return a*b;
 | 
			
		||||
       }
 | 
			
		||||
     };
 | 
			
		||||
     template<> struct switcheroo<ComplexF> {  
 | 
			
		||||
       static inline int iscomplex()  { return 1; } 
 | 
			
		||||
     template<> struct switcheroo<ComplexF> {
 | 
			
		||||
       static inline int iscomplex()  { return 1; }
 | 
			
		||||
       template<class vec>
 | 
			
		||||
       static inline vec mult(vec a, vec b) {
 | 
			
		||||
	 return a*b;
 | 
			
		||||
@@ -90,14 +90,14 @@ namespace Grid {
 | 
			
		||||
      // Instantiate different versions depending on Impl
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      void M5D(const FermionField &psi,
 | 
			
		||||
	       const FermionField &phi, 
 | 
			
		||||
	       const FermionField &phi,
 | 
			
		||||
	       FermionField &chi,
 | 
			
		||||
	       std::vector<Coeff_t> &lower,
 | 
			
		||||
	       std::vector<Coeff_t> &diag,
 | 
			
		||||
	       std::vector<Coeff_t> &upper);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag(const FermionField &psi,
 | 
			
		||||
		  const FermionField &phi, 
 | 
			
		||||
		  const FermionField &phi,
 | 
			
		||||
		  FermionField &chi,
 | 
			
		||||
		  std::vector<Coeff_t> &lower,
 | 
			
		||||
		  std::vector<Coeff_t> &diag,
 | 
			
		||||
@@ -125,7 +125,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      // Efficient support for multigrid coarsening
 | 
			
		||||
      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      void   Meooe5D       (const FermionField &in, FermionField &out);
 | 
			
		||||
      void   MeooeDag5D    (const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
@@ -133,23 +133,23 @@ namespace Grid {
 | 
			
		||||
      RealD mass;
 | 
			
		||||
 | 
			
		||||
      // Cayley form Moebius (tanh and zolotarev)
 | 
			
		||||
      std::vector<Coeff_t> omega; 
 | 
			
		||||
      std::vector<Coeff_t> omega;
 | 
			
		||||
      std::vector<Coeff_t> bs;    // S dependent coeffs
 | 
			
		||||
      std::vector<Coeff_t> cs;    
 | 
			
		||||
      std::vector<Coeff_t> as;    
 | 
			
		||||
      std::vector<Coeff_t> cs;
 | 
			
		||||
      std::vector<Coeff_t> as;
 | 
			
		||||
      // For preconditioning Cayley form
 | 
			
		||||
      std::vector<Coeff_t> bee;    
 | 
			
		||||
      std::vector<Coeff_t> cee;    
 | 
			
		||||
      std::vector<Coeff_t> aee;    
 | 
			
		||||
      std::vector<Coeff_t> beo;    
 | 
			
		||||
      std::vector<Coeff_t> ceo;    
 | 
			
		||||
      std::vector<Coeff_t> aeo;    
 | 
			
		||||
      std::vector<Coeff_t> bee;
 | 
			
		||||
      std::vector<Coeff_t> cee;
 | 
			
		||||
      std::vector<Coeff_t> aee;
 | 
			
		||||
      std::vector<Coeff_t> beo;
 | 
			
		||||
      std::vector<Coeff_t> ceo;
 | 
			
		||||
      std::vector<Coeff_t> aeo;
 | 
			
		||||
      // LDU factorisation of the eeoo matrix
 | 
			
		||||
      std::vector<Coeff_t> lee;    
 | 
			
		||||
      std::vector<Coeff_t> leem;    
 | 
			
		||||
      std::vector<Coeff_t> uee;    
 | 
			
		||||
      std::vector<Coeff_t> ueem;    
 | 
			
		||||
      std::vector<Coeff_t> dee;    
 | 
			
		||||
      std::vector<Coeff_t> lee;
 | 
			
		||||
      std::vector<Coeff_t> leem;
 | 
			
		||||
      std::vector<Coeff_t> uee;
 | 
			
		||||
      std::vector<Coeff_t> ueem;
 | 
			
		||||
      std::vector<Coeff_t> dee;
 | 
			
		||||
 | 
			
		||||
      // Matrices of 5d ee inverse params
 | 
			
		||||
      Vector<iSinglet<Simd> >  MatpInv;
 | 
			
		||||
@@ -165,7 +165,7 @@ namespace Grid {
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
     void CayleyReport(void);
 | 
			
		||||
     void CayleyZeroCounters(void);
 | 
			
		||||
@@ -179,9 +179,9 @@ namespace Grid {
 | 
			
		||||
     double MooeeInvTime;
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 | 
			
		||||
      virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										438
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermion.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										438
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermion.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,438 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid_Eigen_Dense.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
 | 
			
		||||
      GaugeField            &_Umu,
 | 
			
		||||
      GridCartesian         &FiveDimGrid,
 | 
			
		||||
      GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
      GridCartesian         &FourDimGrid,
 | 
			
		||||
      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
      RealD _mq1, RealD _mq2, RealD _mq3,
 | 
			
		||||
      RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
 | 
			
		||||
    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
 | 
			
		||||
        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
 | 
			
		||||
        _shift, _pm, _M5, 1.0, 0.0, p)
 | 
			
		||||
    {
 | 
			
		||||
        RealD eps = 1.0;
 | 
			
		||||
        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
 | 
			
		||||
        assert(zdata->n == this->Ls);
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
 | 
			
		||||
        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
 | 
			
		||||
 | 
			
		||||
        Approx::zolotarev_free(zdata);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /***************************************************************
 | 
			
		||||
     * Additional EOFA operators only called outside the inverter.
 | 
			
		||||
     * Since speed is not essential, simple axpby-style
 | 
			
		||||
     * implementations should be fine.
 | 
			
		||||
     ***************************************************************/
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        Din = zero;
 | 
			
		||||
        if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
 | 
			
		||||
        else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
 | 
			
		||||
        else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
 | 
			
		||||
        else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // This is just the identity for DWF
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
 | 
			
		||||
 | 
			
		||||
    // This is just the identity for DWF
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
 | 
			
		||||
 | 
			
		||||
    /*****************************************************************************************************/
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        FermionField Din(psi._grid);
 | 
			
		||||
 | 
			
		||||
        this->Meooe5D(psi, Din);
 | 
			
		||||
        this->DW(Din, chi, DaggerNo);
 | 
			
		||||
        axpby(chi, 1.0, 1.0, chi, psi);
 | 
			
		||||
        this->M5D(psi, chi);
 | 
			
		||||
        return(norm2(chi));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        FermionField Din(psi._grid);
 | 
			
		||||
 | 
			
		||||
        this->DW(psi, Din, DaggerYes);
 | 
			
		||||
        this->MeooeDag5D(Din, chi);
 | 
			
		||||
        this->M5Ddag(psi, chi);
 | 
			
		||||
        axpby(chi, 1.0, 1.0, chi, psi);
 | 
			
		||||
        return(norm2(chi));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /********************************************************************
 | 
			
		||||
     * Performance critical fermion operators called inside the inverter
 | 
			
		||||
     ********************************************************************/
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int   Ls    = this->Ls;
 | 
			
		||||
        int   pm    = this->pm;
 | 
			
		||||
        RealD shift = this->shift;
 | 
			
		||||
        RealD mq1   = this->mq1;
 | 
			
		||||
        RealD mq2   = this->mq2;
 | 
			
		||||
        RealD mq3   = this->mq3;
 | 
			
		||||
 | 
			
		||||
        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
 | 
			
		||||
        Coeff_t shiftp(0.0), shiftm(0.0);
 | 
			
		||||
        if(shift != 0.0){
 | 
			
		||||
          if(pm == 1){ shiftp = shift*(mq3-mq2); }
 | 
			
		||||
          else{ shiftm = -shift*(mq3-mq2); }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
 | 
			
		||||
 | 
			
		||||
        #if(0)
 | 
			
		||||
            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
 | 
			
		||||
            for(int i=0; i<diag.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<upper.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<lower.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        this->M5D(psi, chi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int   Ls    = this->Ls;
 | 
			
		||||
        int   pm    = this->pm;
 | 
			
		||||
        RealD shift = this->shift;
 | 
			
		||||
        RealD mq1   = this->mq1;
 | 
			
		||||
        RealD mq2   = this->mq2;
 | 
			
		||||
        RealD mq3   = this->mq3;
 | 
			
		||||
 | 
			
		||||
        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
 | 
			
		||||
        Coeff_t shiftp(0.0), shiftm(0.0);
 | 
			
		||||
        if(shift != 0.0){
 | 
			
		||||
          if(pm == 1){ shiftp = shift*(mq3-mq2); }
 | 
			
		||||
          else{ shiftm = -shift*(mq3-mq2); }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
 | 
			
		||||
 | 
			
		||||
        #if(0)
 | 
			
		||||
            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
 | 
			
		||||
            for(int i=0; i<diag.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<upper.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<lower.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        this->M5Ddag(psi, chi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // half checkerboard operations
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag = this->bee;
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
          upper[s] = -this->cee[s];
 | 
			
		||||
          lower[s] = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
        upper[Ls-1] = this->dm;
 | 
			
		||||
        lower[0]    = this->dp;
 | 
			
		||||
 | 
			
		||||
        this->M5D(psi, psi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag = this->bee;
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
          upper[s] = -this->cee[s];
 | 
			
		||||
          lower[s] = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
        upper[Ls-1] = this->dp;
 | 
			
		||||
        lower[0]    = this->dm;
 | 
			
		||||
 | 
			
		||||
        this->M5Ddag(psi, psi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /****************************************************************************************/
 | 
			
		||||
 | 
			
		||||
    //Zolo
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
 | 
			
		||||
    {
 | 
			
		||||
        int   Ls    = this->Ls;
 | 
			
		||||
        int   pm    = this->pm;
 | 
			
		||||
        RealD mq1   = this->mq1;
 | 
			
		||||
        RealD mq2   = this->mq2;
 | 
			
		||||
        RealD mq3   = this->mq3;
 | 
			
		||||
        RealD shift = this->shift;
 | 
			
		||||
 | 
			
		||||
        ////////////////////////////////////////////////////////
 | 
			
		||||
        // Constants for the preconditioned matrix Cayley form
 | 
			
		||||
        ////////////////////////////////////////////////////////
 | 
			
		||||
        this->bs.resize(Ls);
 | 
			
		||||
        this->cs.resize(Ls);
 | 
			
		||||
        this->aee.resize(Ls);
 | 
			
		||||
        this->aeo.resize(Ls);
 | 
			
		||||
        this->bee.resize(Ls);
 | 
			
		||||
        this->beo.resize(Ls);
 | 
			
		||||
        this->cee.resize(Ls);
 | 
			
		||||
        this->ceo.resize(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int i=0; i<Ls; ++i){
 | 
			
		||||
          this->bee[i] = 4.0 - this->M5 + 1.0;
 | 
			
		||||
          this->cee[i] = 1.0;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int i=0; i<Ls; ++i){
 | 
			
		||||
          this->aee[i] = this->cee[i];
 | 
			
		||||
          this->bs[i] = this->beo[i] = 1.0;
 | 
			
		||||
          this->cs[i] = this->ceo[i] = 0.0;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        // EOFA shift terms
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        if(pm == 1){
 | 
			
		||||
          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
 | 
			
		||||
          this->dm = mq1*this->cee[Ls-1];
 | 
			
		||||
        } else if(this->pm == -1) {
 | 
			
		||||
          this->dp = mq1*this->cee[0];
 | 
			
		||||
          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
 | 
			
		||||
        } else {
 | 
			
		||||
          this->dp = mq1*this->cee[0];
 | 
			
		||||
          this->dm = mq1*this->cee[Ls-1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        // LDU decomposition of eeoo
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        this->dee.resize(Ls+1);
 | 
			
		||||
        this->lee.resize(Ls);
 | 
			
		||||
        this->leem.resize(Ls);
 | 
			
		||||
        this->uee.resize(Ls);
 | 
			
		||||
        this->ueem.resize(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int i=0; i<Ls; ++i){
 | 
			
		||||
 | 
			
		||||
          if(i < Ls-1){
 | 
			
		||||
 | 
			
		||||
            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
 | 
			
		||||
 | 
			
		||||
            this->leem[i] = this->dm/this->bee[i];
 | 
			
		||||
            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
 | 
			
		||||
 | 
			
		||||
            this->dee[i] = this->bee[i];
 | 
			
		||||
 | 
			
		||||
            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
 | 
			
		||||
 | 
			
		||||
            this->ueem[i] = this->dp / this->bee[0];
 | 
			
		||||
            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
 | 
			
		||||
 | 
			
		||||
          } else {
 | 
			
		||||
 | 
			
		||||
            this->lee[i]  = 0.0;
 | 
			
		||||
            this->leem[i] = 0.0;
 | 
			
		||||
            this->uee[i]  = 0.0;
 | 
			
		||||
            this->ueem[i] = 0.0;
 | 
			
		||||
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        {
 | 
			
		||||
          Coeff_t delta_d = 1.0 / this->bee[0];
 | 
			
		||||
          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
 | 
			
		||||
          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
 | 
			
		||||
          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        int inv = 1;
 | 
			
		||||
        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
 | 
			
		||||
        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Recompute Cayley-form coefficients for different shift
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
 | 
			
		||||
    {
 | 
			
		||||
        this->shift = new_shift;
 | 
			
		||||
        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
 | 
			
		||||
        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
 | 
			
		||||
        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        GridBase* grid = this->FermionRedBlackGrid();
 | 
			
		||||
        int LLs = grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
        if(LLs == Ls){ return; } // Not vectorised in 5th direction
 | 
			
		||||
 | 
			
		||||
        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
            Pplus(s,s)  = this->bee[s];
 | 
			
		||||
            Pminus(s,s) = this->bee[s];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            Pminus(s,s+1) = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            Pplus(s+1,s) = -this->cee[s+1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Pplus (0,Ls-1) = this->dp;
 | 
			
		||||
        Pminus(Ls-1,0) = this->dm;
 | 
			
		||||
 | 
			
		||||
        Eigen::MatrixXcd PplusMat ;
 | 
			
		||||
        Eigen::MatrixXcd PminusMat;
 | 
			
		||||
 | 
			
		||||
        #if(0)
 | 
			
		||||
            std::cout << GridLogMessage << "Pplus:" << std::endl;
 | 
			
		||||
            for(int s=0; s<Ls; ++s){
 | 
			
		||||
                for(int ss=0; ss<Ls; ++ss){
 | 
			
		||||
                    std::cout << Pplus(s,ss) << "\t";
 | 
			
		||||
                }
 | 
			
		||||
                std::cout << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            std::cout << GridLogMessage << "Pminus:" << std::endl;
 | 
			
		||||
            for(int s=0; s<Ls; ++s){
 | 
			
		||||
                for(int ss=0; ss<Ls; ++ss){
 | 
			
		||||
                    std::cout << Pminus(s,ss) << "\t";
 | 
			
		||||
                }
 | 
			
		||||
                std::cout << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        if(inv) {
 | 
			
		||||
            PplusMat  = Pplus.inverse();
 | 
			
		||||
            PminusMat = Pminus.inverse();
 | 
			
		||||
        } else {
 | 
			
		||||
            PplusMat  = Pplus;
 | 
			
		||||
            PminusMat = Pminus;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(dag){
 | 
			
		||||
            PplusMat.adjointInPlace();
 | 
			
		||||
            PminusMat.adjointInPlace();
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        typedef typename SiteHalfSpinor::scalar_type scalar_type;
 | 
			
		||||
        const int Nsimd = Simd::Nsimd();
 | 
			
		||||
        Matp.resize(Ls*LLs);
 | 
			
		||||
        Matm.resize(Ls*LLs);
 | 
			
		||||
 | 
			
		||||
        for(int s2=0; s2<Ls; s2++){
 | 
			
		||||
        for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
            int istride = LLs;
 | 
			
		||||
            int ostride = 1;
 | 
			
		||||
            Simd Vp;
 | 
			
		||||
            Simd Vm;
 | 
			
		||||
            scalar_type *sp = (scalar_type*) &Vp;
 | 
			
		||||
            scalar_type *sm = (scalar_type*) &Vm;
 | 
			
		||||
            for(int l=0; l<Nsimd; l++){
 | 
			
		||||
                if(switcheroo<Coeff_t>::iscomplex()) {
 | 
			
		||||
                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
                } else {
 | 
			
		||||
                    // if real
 | 
			
		||||
                    scalar_type tmp;
 | 
			
		||||
                    tmp = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
                    sp[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
                    tmp = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
                    sm[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            Matp[LLs*s2+s1] = Vp;
 | 
			
		||||
            Matm[LLs*s2+s1] = Vm;
 | 
			
		||||
        }}
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    FermOpTemplateInstantiate(DomainWallEOFAFermion);
 | 
			
		||||
    GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										115
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermion.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermion.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,115 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
 | 
			
		||||
#define  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
 | 
			
		||||
  {
 | 
			
		||||
    public:
 | 
			
		||||
      INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
      // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
 | 
			
		||||
      // for red-black preconditioned Shamir EOFA
 | 
			
		||||
      Coeff_t dm;
 | 
			
		||||
      Coeff_t dp;
 | 
			
		||||
 | 
			
		||||
      virtual void Instantiatable(void) {};
 | 
			
		||||
 | 
			
		||||
      // EOFA-specific operations
 | 
			
		||||
      virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
 | 
			
		||||
      virtual void  Dtilde     (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  DtildeInv  (const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      // override multiply
 | 
			
		||||
      virtual RealD M          (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual RealD Mdag       (const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      // half checkerboard operations
 | 
			
		||||
      virtual void  Mooee      (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeDag   (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInv   (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInvDag(const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      virtual void   M5D       (const FermionField& psi, FermionField& chi);
 | 
			
		||||
      virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);
 | 
			
		||||
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // Instantiate different versions depending on Impl
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
 | 
			
		||||
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
 | 
			
		||||
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      virtual void RefreshShiftCoefficients(RealD new_shift);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
 | 
			
		||||
        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
 | 
			
		||||
        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
 | 
			
		||||
        RealD _M5, const ImplParams& p=ImplParams());
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
 | 
			
		||||
  };
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_DPERP_DWF_EOFA(A)\
 | 
			
		||||
template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
 | 
			
		||||
template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
 | 
			
		||||
template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
 | 
			
		||||
template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
 | 
			
		||||
 | 
			
		||||
#undef  DOMAIN_WALL_EOFA_DPERP_DENSE
 | 
			
		||||
#define DOMAIN_WALL_EOFA_DPERP_CACHE
 | 
			
		||||
#undef  DOMAIN_WALL_EOFA_DPERP_LINALG
 | 
			
		||||
#define DOMAIN_WALL_EOFA_DPERP_VEC
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										248
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										248
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,248 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 | 
			
		||||
 | 
			
		||||
    // Pminus fowards
 | 
			
		||||
    // Pplus  backwards..
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
        assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
        // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
        this->M5Dcalls++;
 | 
			
		||||
        this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
            for(int s=0; s<Ls; s++){
 | 
			
		||||
                auto tmp = psi._odata[0];
 | 
			
		||||
                if(s==0) {
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else if(s==(Ls-1)) {
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+0]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else {
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->M5Dtime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
        chi.checkerboard=psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
        this->M5Dcalls++;
 | 
			
		||||
        this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
            auto tmp = psi._odata[0];
 | 
			
		||||
            for(int s=0; s<Ls; s++){
 | 
			
		||||
                if(s==0) {
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else if(s==(Ls-1)) {
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+0]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else {
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->M5Dtime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvCalls++;
 | 
			
		||||
        this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
 | 
			
		||||
            auto tmp1 = psi._odata[0];
 | 
			
		||||
            auto tmp2 = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
            // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
 | 
			
		||||
            // Apply (L^{\prime})^{-1}
 | 
			
		||||
            chi[ss] = psi[ss]; // chi[0]=psi[0]
 | 
			
		||||
            for(int s=1; s<Ls; s++){
 | 
			
		||||
                spProj5p(tmp1, chi[ss+s-1]);
 | 
			
		||||
                chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // L_m^{-1}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
                spProj5m(tmp1, chi[ss+s]);
 | 
			
		||||
                chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // U_m^{-1} D^{-1}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
 | 
			
		||||
                spProj5p(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
                chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
            spProj5m(tmp2, chi[ss+Ls-1]);
 | 
			
		||||
            chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
 | 
			
		||||
 | 
			
		||||
            // Apply U^{-1}
 | 
			
		||||
            for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
                spProj5m(tmp1, chi[ss+s+1]);
 | 
			
		||||
                chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvTime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        assert(psi.checkerboard == psi.checkerboard);
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> ueec(Ls);
 | 
			
		||||
        std::vector<Coeff_t> deec(Ls+1);
 | 
			
		||||
        std::vector<Coeff_t> leec(Ls);
 | 
			
		||||
        std::vector<Coeff_t> ueemc(Ls);
 | 
			
		||||
        std::vector<Coeff_t> leemc(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<ueec.size(); s++){
 | 
			
		||||
            ueec[s]  = conjugate(this->uee[s]);
 | 
			
		||||
            deec[s]  = conjugate(this->dee[s]);
 | 
			
		||||
            leec[s]  = conjugate(this->lee[s]);
 | 
			
		||||
            ueemc[s] = conjugate(this->ueem[s]);
 | 
			
		||||
            leemc[s] = conjugate(this->leem[s]);
 | 
			
		||||
        }
 | 
			
		||||
        deec[Ls] = conjugate(this->dee[Ls]);
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvCalls++;
 | 
			
		||||
        this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
 | 
			
		||||
            auto tmp1 = psi._odata[0];
 | 
			
		||||
            auto tmp2 = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
            // Apply (U^{\prime})^{-dagger}
 | 
			
		||||
            chi[ss] = psi[ss];
 | 
			
		||||
            for(int s=1; s<Ls; s++){
 | 
			
		||||
                spProj5m(tmp1, chi[ss+s-1]);
 | 
			
		||||
                chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // U_m^{-\dagger}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){
 | 
			
		||||
                spProj5p(tmp1, chi[ss+s]);
 | 
			
		||||
                chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // L_m^{-\dagger} D^{-dagger}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){
 | 
			
		||||
                spProj5m(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
                chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
            spProj5p(tmp2, chi[ss+Ls-1]);
 | 
			
		||||
            chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
 | 
			
		||||
 | 
			
		||||
            // Apply L^{-dagger}
 | 
			
		||||
            for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
                spProj5p(tmp1, chi[ss+s+1]);
 | 
			
		||||
                chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvTime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										159
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										159
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,159 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid_Eigen_Dense.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
    * Dense matrix versions of routines
 | 
			
		||||
    */
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        int LLs = psi._grid->_rdimensions[0];
 | 
			
		||||
        int vol = psi._grid->oSites()/LLs;
 | 
			
		||||
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        assert(Ls==LLs);
 | 
			
		||||
 | 
			
		||||
        Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
 | 
			
		||||
        Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0;s<Ls;s++){
 | 
			
		||||
            Pplus(s,s)  = this->bee[s];
 | 
			
		||||
            Pminus(s,s) = this->bee[s];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            Pminus(s,s+1) = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            Pplus(s+1,s) = -this->cee[s+1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Pplus (0,Ls-1) = this->dp;
 | 
			
		||||
        Pminus(Ls-1,0) = this->dm;
 | 
			
		||||
 | 
			
		||||
        Eigen::MatrixXd PplusMat ;
 | 
			
		||||
        Eigen::MatrixXd PminusMat;
 | 
			
		||||
 | 
			
		||||
        if(inv) {
 | 
			
		||||
            PplusMat  = Pplus.inverse();
 | 
			
		||||
            PminusMat = Pminus.inverse();
 | 
			
		||||
        } else {
 | 
			
		||||
            PplusMat  = Pplus;
 | 
			
		||||
            PminusMat = Pminus;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(dag){
 | 
			
		||||
            PplusMat.adjointInPlace();
 | 
			
		||||
            PminusMat.adjointInPlace();
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // For the non-vectorised s-direction this is simple
 | 
			
		||||
 | 
			
		||||
        for(auto site=0; site<vol; site++){
 | 
			
		||||
 | 
			
		||||
            SiteSpinor     SiteChi;
 | 
			
		||||
            SiteHalfSpinor SitePplus;
 | 
			
		||||
            SiteHalfSpinor SitePminus;
 | 
			
		||||
 | 
			
		||||
            for(int s1=0; s1<Ls; s1++){
 | 
			
		||||
                SiteChi = zero;
 | 
			
		||||
                for(int s2=0; s2<Ls; s2++){
 | 
			
		||||
                    int lex2 = s2 + Ls*site;
 | 
			
		||||
                    if(PplusMat(s1,s2) != 0.0){
 | 
			
		||||
                        spProj5p(SitePplus,psi[lex2]);
 | 
			
		||||
                        accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
 | 
			
		||||
                    }
 | 
			
		||||
                    if(PminusMat(s1,s2) != 0.0){
 | 
			
		||||
                        spProj5m(SitePminus, psi[lex2]);
 | 
			
		||||
                        accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
                chi[s1+Ls*site] = SiteChi*0.5;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
        template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
        template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										168
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,168 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 | 
			
		||||
    // Pminus fowards
 | 
			
		||||
    // Pplus  backwards
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        Coeff_t one(1.0);
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
            if(s==0) {
 | 
			
		||||
              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
              axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
 | 
			
		||||
            } else if (s==(Ls-1)) {
 | 
			
		||||
              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
 | 
			
		||||
              axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
            } else {
 | 
			
		||||
              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
              axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        Coeff_t one(1.0);
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
            if(s==0) {
 | 
			
		||||
              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
 | 
			
		||||
            } else if (s==(Ls-1)) {
 | 
			
		||||
              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
 | 
			
		||||
              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
            } else {
 | 
			
		||||
              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        Coeff_t one(1.0);
 | 
			
		||||
        Coeff_t czero(0.0);
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        FermionField tmp(psi._grid);
 | 
			
		||||
 | 
			
		||||
        // Apply (L^{\prime})^{-1}
 | 
			
		||||
        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
 | 
			
		||||
        for(int s=1; s<Ls; s++){
 | 
			
		||||
            axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // L_m^{-1}
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
            axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // U_m^{-1} D^{-1}
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
 | 
			
		||||
        }
 | 
			
		||||
        axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
 | 
			
		||||
        axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
 | 
			
		||||
 | 
			
		||||
        // Apply U^{-1}
 | 
			
		||||
        for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
            axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        Coeff_t one(1.0);
 | 
			
		||||
        Coeff_t czero(0.0);
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        FermionField tmp(psi._grid);
 | 
			
		||||
 | 
			
		||||
        // Apply (U^{\prime})^{-dagger}
 | 
			
		||||
        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
 | 
			
		||||
        for(int s=1; s<Ls; s++){
 | 
			
		||||
            axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // U_m^{-\dagger}
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // L_m^{-\dagger} D^{-dagger}
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
 | 
			
		||||
        }
 | 
			
		||||
        axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
 | 
			
		||||
        axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
 | 
			
		||||
 | 
			
		||||
        // Apply L^{-dagger}
 | 
			
		||||
        for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
            axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										605
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										605
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,605 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
    * Dense matrix versions of routines
 | 
			
		||||
    */
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        int Ls  = this->Ls;
 | 
			
		||||
        int LLs = grid->_rdimensions[0];
 | 
			
		||||
        const int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
        Vector<iSinglet<Simd> > u(LLs);
 | 
			
		||||
        Vector<iSinglet<Simd> > l(LLs);
 | 
			
		||||
        Vector<iSinglet<Simd> > d(LLs);
 | 
			
		||||
 | 
			
		||||
        assert(Ls/LLs == nsimd);
 | 
			
		||||
        assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        // just directly address via type pun
 | 
			
		||||
        typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
        scalar_type* u_p = (scalar_type*) &u[0];
 | 
			
		||||
        scalar_type* l_p = (scalar_type*) &l[0];
 | 
			
		||||
        scalar_type* d_p = (scalar_type*) &d[0];
 | 
			
		||||
 | 
			
		||||
        for(int o=0;o<LLs;o++){ // outer
 | 
			
		||||
        for(int i=0;i<nsimd;i++){ //inner
 | 
			
		||||
            int s  = o + i*LLs;
 | 
			
		||||
            int ss = o*nsimd + i;
 | 
			
		||||
            u_p[ss] = upper[s];
 | 
			
		||||
            l_p[ss] = lower[s];
 | 
			
		||||
            d_p[ss] = diag[s];
 | 
			
		||||
        }}
 | 
			
		||||
 | 
			
		||||
        this->M5Dcalls++;
 | 
			
		||||
        this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
        assert(Nc == 3);
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
            #if 0
 | 
			
		||||
 | 
			
		||||
                alignas(64) SiteHalfSpinor hp;
 | 
			
		||||
                alignas(64) SiteHalfSpinor hm;
 | 
			
		||||
                alignas(64) SiteSpinor fp;
 | 
			
		||||
                alignas(64) SiteSpinor fm;
 | 
			
		||||
 | 
			
		||||
                for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
                    int vp = (v+1)%LLs;
 | 
			
		||||
                    int vm = (v+LLs-1)%LLs;
 | 
			
		||||
 | 
			
		||||
                    spProj5m(hp, psi[ss+vp]);
 | 
			
		||||
                    spProj5p(hm, psi[ss+vm]);
 | 
			
		||||
 | 
			
		||||
                    if (vp <= v){ rotate(hp, hp, 1); }
 | 
			
		||||
                    if (vm >= v){ rotate(hm, hm, nsimd-1); }
 | 
			
		||||
 | 
			
		||||
                    hp = 0.5*hp;
 | 
			
		||||
                    hm = 0.5*hm;
 | 
			
		||||
 | 
			
		||||
                    spRecon5m(fp, hp);
 | 
			
		||||
                    spRecon5p(fm, hm);
 | 
			
		||||
 | 
			
		||||
                    chi[ss+v] = d[v]*phi[ss+v];
 | 
			
		||||
                    chi[ss+v] = chi[ss+v] + u[v]*fp;
 | 
			
		||||
                    chi[ss+v] = chi[ss+v] + l[v]*fm;
 | 
			
		||||
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
            #else
 | 
			
		||||
 | 
			
		||||
                for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
                    vprefetch(psi[ss+v+LLs]);
 | 
			
		||||
 | 
			
		||||
                    int vp = (v==LLs-1) ? 0     : v+1;
 | 
			
		||||
                    int vm = (v==0)     ? LLs-1 : v-1;
 | 
			
		||||
 | 
			
		||||
                    Simd hp_00 = psi[ss+vp]()(2)(0);
 | 
			
		||||
                    Simd hp_01 = psi[ss+vp]()(2)(1);
 | 
			
		||||
                    Simd hp_02 = psi[ss+vp]()(2)(2);
 | 
			
		||||
                    Simd hp_10 = psi[ss+vp]()(3)(0);
 | 
			
		||||
                    Simd hp_11 = psi[ss+vp]()(3)(1);
 | 
			
		||||
                    Simd hp_12 = psi[ss+vp]()(3)(2);
 | 
			
		||||
 | 
			
		||||
                    Simd hm_00 = psi[ss+vm]()(0)(0);
 | 
			
		||||
                    Simd hm_01 = psi[ss+vm]()(0)(1);
 | 
			
		||||
                    Simd hm_02 = psi[ss+vm]()(0)(2);
 | 
			
		||||
                    Simd hm_10 = psi[ss+vm]()(1)(0);
 | 
			
		||||
                    Simd hm_11 = psi[ss+vm]()(1)(1);
 | 
			
		||||
                    Simd hm_12 = psi[ss+vm]()(1)(2);
 | 
			
		||||
 | 
			
		||||
                    if(vp <= v){
 | 
			
		||||
                        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		||||
                        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		||||
                        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		||||
                        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		||||
                        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		||||
                        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    if(vm >= v){
 | 
			
		||||
                        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		||||
                        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		||||
                        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		||||
                        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		||||
                        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		||||
                        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    // Can force these to real arithmetic and save 2x.
 | 
			
		||||
                    Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
 | 
			
		||||
                    Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
 | 
			
		||||
                    Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
 | 
			
		||||
                    Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
 | 
			
		||||
                    Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
 | 
			
		||||
                    Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
 | 
			
		||||
                    Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
 | 
			
		||||
                    Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
 | 
			
		||||
                    Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
 | 
			
		||||
                    Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
 | 
			
		||||
                    Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
 | 
			
		||||
                    Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
 | 
			
		||||
 | 
			
		||||
                    vstream(chi[ss+v]()(0)(0), p_00);
 | 
			
		||||
                    vstream(chi[ss+v]()(0)(1), p_01);
 | 
			
		||||
                    vstream(chi[ss+v]()(0)(2), p_02);
 | 
			
		||||
                    vstream(chi[ss+v]()(1)(0), p_10);
 | 
			
		||||
                    vstream(chi[ss+v]()(1)(1), p_11);
 | 
			
		||||
                    vstream(chi[ss+v]()(1)(2), p_12);
 | 
			
		||||
                    vstream(chi[ss+v]()(2)(0), p_20);
 | 
			
		||||
                    vstream(chi[ss+v]()(2)(1), p_21);
 | 
			
		||||
                    vstream(chi[ss+v]()(2)(2), p_22);
 | 
			
		||||
                    vstream(chi[ss+v]()(3)(0), p_30);
 | 
			
		||||
                    vstream(chi[ss+v]()(3)(1), p_31);
 | 
			
		||||
                    vstream(chi[ss+v]()(3)(2), p_32);
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
            #endif
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->M5Dtime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        int Ls  = this->Ls;
 | 
			
		||||
        int LLs = grid->_rdimensions[0];
 | 
			
		||||
        int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
        Vector<iSinglet<Simd> > u(LLs);
 | 
			
		||||
        Vector<iSinglet<Simd> > l(LLs);
 | 
			
		||||
        Vector<iSinglet<Simd> > d(LLs);
 | 
			
		||||
 | 
			
		||||
        assert(Ls/LLs == nsimd);
 | 
			
		||||
        assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        // just directly address via type pun
 | 
			
		||||
        typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
        scalar_type* u_p = (scalar_type*) &u[0];
 | 
			
		||||
        scalar_type* l_p = (scalar_type*) &l[0];
 | 
			
		||||
        scalar_type* d_p = (scalar_type*) &d[0];
 | 
			
		||||
 | 
			
		||||
        for(int o=0; o<LLs; o++){ // outer
 | 
			
		||||
        for(int i=0; i<nsimd; i++){ //inner
 | 
			
		||||
            int s  = o + i*LLs;
 | 
			
		||||
            int ss = o*nsimd + i;
 | 
			
		||||
            u_p[ss] = upper[s];
 | 
			
		||||
            l_p[ss] = lower[s];
 | 
			
		||||
            d_p[ss] = diag[s];
 | 
			
		||||
        }}
 | 
			
		||||
 | 
			
		||||
        this->M5Dcalls++;
 | 
			
		||||
        this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
        #if 0
 | 
			
		||||
 | 
			
		||||
            alignas(64) SiteHalfSpinor hp;
 | 
			
		||||
            alignas(64) SiteHalfSpinor hm;
 | 
			
		||||
            alignas(64) SiteSpinor fp;
 | 
			
		||||
            alignas(64) SiteSpinor fm;
 | 
			
		||||
 | 
			
		||||
            for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
                int vp = (v+1)%LLs;
 | 
			
		||||
                int vm = (v+LLs-1)%LLs;
 | 
			
		||||
 | 
			
		||||
                spProj5p(hp, psi[ss+vp]);
 | 
			
		||||
                spProj5m(hm, psi[ss+vm]);
 | 
			
		||||
 | 
			
		||||
                if(vp <= v){ rotate(hp, hp, 1); }
 | 
			
		||||
                if(vm >= v){ rotate(hm, hm, nsimd-1); }
 | 
			
		||||
 | 
			
		||||
                hp = hp*0.5;
 | 
			
		||||
                hm = hm*0.5;
 | 
			
		||||
                spRecon5p(fp, hp);
 | 
			
		||||
                spRecon5m(fm, hm);
 | 
			
		||||
 | 
			
		||||
                chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
 | 
			
		||||
                chi[ss+v] = chi[ss+v]     +l[v]*fm;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
        #else
 | 
			
		||||
 | 
			
		||||
            for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
                vprefetch(psi[ss+v+LLs]);
 | 
			
		||||
 | 
			
		||||
                int vp = (v == LLs-1) ? 0     : v+1;
 | 
			
		||||
                int vm = (v == 0    ) ? LLs-1 : v-1;
 | 
			
		||||
 | 
			
		||||
                Simd hp_00 = psi[ss+vp]()(0)(0);
 | 
			
		||||
                Simd hp_01 = psi[ss+vp]()(0)(1);
 | 
			
		||||
                Simd hp_02 = psi[ss+vp]()(0)(2);
 | 
			
		||||
                Simd hp_10 = psi[ss+vp]()(1)(0);
 | 
			
		||||
                Simd hp_11 = psi[ss+vp]()(1)(1);
 | 
			
		||||
                Simd hp_12 = psi[ss+vp]()(1)(2);
 | 
			
		||||
 | 
			
		||||
                Simd hm_00 = psi[ss+vm]()(2)(0);
 | 
			
		||||
                Simd hm_01 = psi[ss+vm]()(2)(1);
 | 
			
		||||
                Simd hm_02 = psi[ss+vm]()(2)(2);
 | 
			
		||||
                Simd hm_10 = psi[ss+vm]()(3)(0);
 | 
			
		||||
                Simd hm_11 = psi[ss+vm]()(3)(1);
 | 
			
		||||
                Simd hm_12 = psi[ss+vm]()(3)(2);
 | 
			
		||||
 | 
			
		||||
                if (vp <= v){
 | 
			
		||||
                    hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		||||
                    hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		||||
                    hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		||||
                    hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		||||
                    hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		||||
                    hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                if(vm >= v){
 | 
			
		||||
                    hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		||||
                    hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		||||
                    hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		||||
                    hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		||||
                    hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		||||
                    hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
 | 
			
		||||
                Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
 | 
			
		||||
                Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
 | 
			
		||||
                Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
 | 
			
		||||
                Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
 | 
			
		||||
                Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
 | 
			
		||||
                Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
 | 
			
		||||
                Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
 | 
			
		||||
                Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
 | 
			
		||||
                Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
 | 
			
		||||
                Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
 | 
			
		||||
                Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
 | 
			
		||||
 | 
			
		||||
                vstream(chi[ss+v]()(0)(0), p_00);
 | 
			
		||||
                vstream(chi[ss+v]()(0)(1), p_01);
 | 
			
		||||
                vstream(chi[ss+v]()(0)(2), p_02);
 | 
			
		||||
                vstream(chi[ss+v]()(1)(0), p_10);
 | 
			
		||||
                vstream(chi[ss+v]()(1)(1), p_11);
 | 
			
		||||
                vstream(chi[ss+v]()(1)(2), p_12);
 | 
			
		||||
                vstream(chi[ss+v]()(2)(0), p_20);
 | 
			
		||||
                vstream(chi[ss+v]()(2)(1), p_21);
 | 
			
		||||
                vstream(chi[ss+v]()(2)(2), p_22);
 | 
			
		||||
                vstream(chi[ss+v]()(3)(0), p_30);
 | 
			
		||||
                vstream(chi[ss+v]()(3)(1), p_31);
 | 
			
		||||
                vstream(chi[ss+v]()(3)(2), p_32);
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->M5Dtime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #ifdef AVX512
 | 
			
		||||
        #include<simd/Intel512common.h>
 | 
			
		||||
        #include<simd/Intel512avx.h>
 | 
			
		||||
        #include<simd/Intel512single.h>
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
 | 
			
		||||
        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
    {
 | 
			
		||||
        #ifndef AVX512
 | 
			
		||||
        {
 | 
			
		||||
            SiteHalfSpinor BcastP;
 | 
			
		||||
            SiteHalfSpinor BcastM;
 | 
			
		||||
            SiteHalfSpinor SiteChiP;
 | 
			
		||||
            SiteHalfSpinor SiteChiM;
 | 
			
		||||
 | 
			
		||||
            // Ls*Ls * 2 * 12 * vol flops
 | 
			
		||||
            for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
 | 
			
		||||
                for(int s2=0; s2<LLs; s2++){
 | 
			
		||||
                for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
 | 
			
		||||
 | 
			
		||||
                    int s = s2 + l*LLs;
 | 
			
		||||
                    int lex = s2 + LLs*site;
 | 
			
		||||
 | 
			
		||||
                    if( s2==0 && l==0 ){
 | 
			
		||||
                        SiteChiP=zero;
 | 
			
		||||
                        SiteChiM=zero;
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    for(int sp=0; sp<2;  sp++){
 | 
			
		||||
                    for(int co=0; co<Nc; co++){
 | 
			
		||||
                        vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
 | 
			
		||||
                    }}
 | 
			
		||||
 | 
			
		||||
                    for(int sp=0; sp<2;  sp++){
 | 
			
		||||
                    for(int co=0; co<Nc; co++){
 | 
			
		||||
                        vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
 | 
			
		||||
                    }}
 | 
			
		||||
 | 
			
		||||
                    for(int sp=0; sp<2;  sp++){
 | 
			
		||||
                    for(int co=0; co<Nc; co++){
 | 
			
		||||
                        SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
 | 
			
		||||
                        SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
 | 
			
		||||
                    }}
 | 
			
		||||
                }}
 | 
			
		||||
 | 
			
		||||
                {
 | 
			
		||||
                    int lex = s1 + LLs*site;
 | 
			
		||||
                    for(int sp=0; sp<2;  sp++){
 | 
			
		||||
                    for(int co=0; co<Nc; co++){
 | 
			
		||||
                        vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
 | 
			
		||||
                        vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
 | 
			
		||||
                    }}
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
        }
 | 
			
		||||
        #else
 | 
			
		||||
        {
 | 
			
		||||
            // pointers
 | 
			
		||||
            //  MASK_REGS;
 | 
			
		||||
            #define Chi_00 %%zmm1
 | 
			
		||||
            #define Chi_01 %%zmm2
 | 
			
		||||
            #define Chi_02 %%zmm3
 | 
			
		||||
            #define Chi_10 %%zmm4
 | 
			
		||||
            #define Chi_11 %%zmm5
 | 
			
		||||
            #define Chi_12 %%zmm6
 | 
			
		||||
            #define Chi_20 %%zmm7
 | 
			
		||||
            #define Chi_21 %%zmm8
 | 
			
		||||
            #define Chi_22 %%zmm9
 | 
			
		||||
            #define Chi_30 %%zmm10
 | 
			
		||||
            #define Chi_31 %%zmm11
 | 
			
		||||
            #define Chi_32 %%zmm12
 | 
			
		||||
 | 
			
		||||
            #define BCAST0  %%zmm13
 | 
			
		||||
            #define BCAST1  %%zmm14
 | 
			
		||||
            #define BCAST2  %%zmm15
 | 
			
		||||
            #define BCAST3  %%zmm16
 | 
			
		||||
            #define BCAST4  %%zmm17
 | 
			
		||||
            #define BCAST5  %%zmm18
 | 
			
		||||
            #define BCAST6  %%zmm19
 | 
			
		||||
            #define BCAST7  %%zmm20
 | 
			
		||||
            #define BCAST8  %%zmm21
 | 
			
		||||
            #define BCAST9  %%zmm22
 | 
			
		||||
            #define BCAST10 %%zmm23
 | 
			
		||||
            #define BCAST11 %%zmm24
 | 
			
		||||
 | 
			
		||||
            int incr = LLs*LLs*sizeof(iSinglet<Simd>);
 | 
			
		||||
            for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
 | 
			
		||||
                for(int s2=0; s2<LLs; s2++){
 | 
			
		||||
 | 
			
		||||
                    int lex = s2 + LLs*site;
 | 
			
		||||
                    uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
 | 
			
		||||
                    uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
 | 
			
		||||
                    uint64_t a2 = (uint64_t) &psi[lex];
 | 
			
		||||
 | 
			
		||||
                    for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
 | 
			
		||||
                        if((s2+l)==0) {
 | 
			
		||||
                            asm(
 | 
			
		||||
                                    VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
 | 
			
		||||
                                    VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
 | 
			
		||||
                                    VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
 | 
			
		||||
                                    VBCASTCDUP(0,%2,BCAST0)
 | 
			
		||||
                                    VBCASTCDUP(1,%2,BCAST1)
 | 
			
		||||
                                    VBCASTCDUP(2,%2,BCAST2)
 | 
			
		||||
                                    VBCASTCDUP(3,%2,BCAST3)
 | 
			
		||||
                                    VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
 | 
			
		||||
                                    VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
 | 
			
		||||
                                    VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
 | 
			
		||||
                                    VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
 | 
			
		||||
                                    VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
 | 
			
		||||
                                    VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
 | 
			
		||||
                                    VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
 | 
			
		||||
                                    VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
 | 
			
		||||
                                    VMULMEM(0,%1,BCAST8,Chi_22)
 | 
			
		||||
                                    VMULMEM(0,%1,BCAST9,Chi_30)
 | 
			
		||||
                                    VMULMEM(0,%1,BCAST10,Chi_31)
 | 
			
		||||
                                    VMULMEM(0,%1,BCAST11,Chi_32)
 | 
			
		||||
                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
 | 
			
		||||
                        } else {
 | 
			
		||||
                            asm(
 | 
			
		||||
                                    VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
 | 
			
		||||
                                    VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
 | 
			
		||||
                                    VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
 | 
			
		||||
                                    VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
 | 
			
		||||
                                    VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
 | 
			
		||||
                                    VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
 | 
			
		||||
                                    VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
 | 
			
		||||
                                    VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
 | 
			
		||||
                                    VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
 | 
			
		||||
                                    VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
 | 
			
		||||
                                    VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
 | 
			
		||||
                                    VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
 | 
			
		||||
                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
 | 
			
		||||
                        }
 | 
			
		||||
                        a0 = a0 + incr;
 | 
			
		||||
                        a1 = a1 + incr;
 | 
			
		||||
                        a2 = a2 + sizeof(Simd::scalar_type);
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                {
 | 
			
		||||
                  int lexa = s1+LLs*site;
 | 
			
		||||
                  asm (
 | 
			
		||||
                     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
 | 
			
		||||
                     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
 | 
			
		||||
                     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
 | 
			
		||||
                     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
 | 
			
		||||
                     : : "r" ((uint64_t)&chi[lexa]) : "memory" );
 | 
			
		||||
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        #undef Chi_00
 | 
			
		||||
        #undef Chi_01
 | 
			
		||||
        #undef Chi_02
 | 
			
		||||
        #undef Chi_10
 | 
			
		||||
        #undef Chi_11
 | 
			
		||||
        #undef Chi_12
 | 
			
		||||
        #undef Chi_20
 | 
			
		||||
        #undef Chi_21
 | 
			
		||||
        #undef Chi_22
 | 
			
		||||
        #undef Chi_30
 | 
			
		||||
        #undef Chi_31
 | 
			
		||||
        #undef Chi_32
 | 
			
		||||
 | 
			
		||||
        #undef BCAST0
 | 
			
		||||
        #undef BCAST1
 | 
			
		||||
        #undef BCAST2
 | 
			
		||||
        #undef BCAST3
 | 
			
		||||
        #undef BCAST4
 | 
			
		||||
        #undef BCAST5
 | 
			
		||||
        #undef BCAST6
 | 
			
		||||
        #undef BCAST7
 | 
			
		||||
        #undef BCAST8
 | 
			
		||||
        #undef BCAST9
 | 
			
		||||
        #undef BCAST10
 | 
			
		||||
        #undef BCAST11
 | 
			
		||||
        #endif
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    // Z-mobius version
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
 | 
			
		||||
        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
    {
 | 
			
		||||
        std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
 | 
			
		||||
        exit(-1);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls  = this->Ls;
 | 
			
		||||
        int LLs = psi._grid->_rdimensions[0];
 | 
			
		||||
        int vol = psi._grid->oSites()/LLs;
 | 
			
		||||
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        Vector<iSinglet<Simd> > Matp;
 | 
			
		||||
        Vector<iSinglet<Simd> > Matm;
 | 
			
		||||
        Vector<iSinglet<Simd> > *_Matp;
 | 
			
		||||
        Vector<iSinglet<Simd> > *_Matm;
 | 
			
		||||
 | 
			
		||||
        //  MooeeInternalCompute(dag,inv,Matp,Matm);
 | 
			
		||||
        if(inv && dag){
 | 
			
		||||
            _Matp = &this->MatpInvDag;
 | 
			
		||||
            _Matm = &this->MatmInvDag;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(inv && (!dag)){
 | 
			
		||||
            _Matp = &this->MatpInv;
 | 
			
		||||
            _Matm = &this->MatmInv;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(!inv){
 | 
			
		||||
            MooeeInternalCompute(dag, inv, Matp, Matm);
 | 
			
		||||
            _Matp = &Matp;
 | 
			
		||||
            _Matm = &Matm;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        assert(_Matp->size() == Ls*LLs);
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvCalls++;
 | 
			
		||||
        this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
        if(switcheroo<Coeff_t>::iscomplex()){
 | 
			
		||||
            parallel_for(auto site=0; site<vol; site++){
 | 
			
		||||
                MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
 | 
			
		||||
            }
 | 
			
		||||
        } else {
 | 
			
		||||
            parallel_for(auto site=0; site<vol; site++){
 | 
			
		||||
                MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvTime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #ifdef DOMAIN_WALL_EOFA_DPERP_VEC
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
 | 
			
		||||
 | 
			
		||||
        template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
        template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h
 | 
			
		||||
 | 
			
		||||
@@ -38,6 +38,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
// - ContinuedFractionFermion5D.cc
 | 
			
		||||
// - WilsonFermion.cc
 | 
			
		||||
// - WilsonKernels.cc
 | 
			
		||||
// - DomainWallEOFAFermion.cc
 | 
			
		||||
// - MobiusEOFAFermion.cc
 | 
			
		||||
//
 | 
			
		||||
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
 | 
			
		||||
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
 | 
			
		||||
@@ -57,8 +59,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 | 
			
		||||
@@ -121,6 +124,14 @@ typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
 | 
			
		||||
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
 | 
			
		||||
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 | 
			
		||||
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 | 
			
		||||
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
 | 
			
		||||
 | 
			
		||||
typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
 | 
			
		||||
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
 | 
			
		||||
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 | 
			
		||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 | 
			
		||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 | 
			
		||||
@@ -129,6 +140,14 @@ typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
 | 
			
		||||
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
 | 
			
		||||
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 | 
			
		||||
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 | 
			
		||||
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
 | 
			
		||||
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
 | 
			
		||||
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 | 
			
		||||
@@ -137,7 +156,7 @@ typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
 | 
			
		||||
typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
 | 
			
		||||
 | 
			
		||||
// Ls vectorised 
 | 
			
		||||
// Ls vectorised
 | 
			
		||||
typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
 | 
			
		||||
typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
 | 
			
		||||
typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
 | 
			
		||||
@@ -146,6 +165,14 @@ typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
 | 
			
		||||
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
 | 
			
		||||
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
 | 
			
		||||
 | 
			
		||||
typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
 | 
			
		||||
typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
 | 
			
		||||
typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
 | 
			
		||||
 | 
			
		||||
typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
 | 
			
		||||
typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
 | 
			
		||||
typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
 | 
			
		||||
 | 
			
		||||
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
 | 
			
		||||
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
 | 
			
		||||
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
 | 
			
		||||
@@ -154,6 +181,14 @@ typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
 | 
			
		||||
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
 | 
			
		||||
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
 | 
			
		||||
typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
 | 
			
		||||
typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
 | 
			
		||||
typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
 | 
			
		||||
typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
 | 
			
		||||
 | 
			
		||||
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
 | 
			
		||||
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
 | 
			
		||||
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
 | 
			
		||||
@@ -214,6 +249,14 @@ typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
 | 
			
		||||
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
 | 
			
		||||
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 | 
			
		||||
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 | 
			
		||||
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
 | 
			
		||||
 | 
			
		||||
typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
 | 
			
		||||
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
 | 
			
		||||
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 | 
			
		||||
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 | 
			
		||||
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
 | 
			
		||||
@@ -230,6 +273,14 @@ typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 | 
			
		||||
 
 | 
			
		||||
@@ -538,6 +538,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 | 
			
		||||
   
 | 
			
		||||
 }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 template <class ref>
 | 
			
		||||
 inline void loadLinkElement(Simd ®, ref &memory) {
 | 
			
		||||
   reg = memory;
 | 
			
		||||
 }
 | 
			
		||||
 | 
			
		||||
 inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
			
		||||
 {
 | 
			
		||||
   conformable(Uds._grid,GaugeGrid);
 | 
			
		||||
 
 | 
			
		||||
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
{
 | 
			
		||||
  Compressor compressor;
 | 
			
		||||
  int LLs = in._grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  DhopTotalTime -= usecond();
 | 
			
		||||
  DhopCommTime -= usecond();
 | 
			
		||||
  st.HaloExchange(in,compressor);
 | 
			
		||||
  DhopCommTime += usecond();
 | 
			
		||||
  
 | 
			
		||||
  DhopComputeTime -= usecond();
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
 | 
			
		||||
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 | 
			
		||||
	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  DhopComputeTime += usecond();
 | 
			
		||||
  DhopTotalTime   += usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=1;
 | 
			
		||||
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
 | 
			
		||||
  conformable(in._grid,out._grid); // drops the cb check
 | 
			
		||||
 | 
			
		||||
@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=1;
 | 
			
		||||
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
 | 
			
		||||
  conformable(in._grid,out._grid); // drops the cb check
 | 
			
		||||
 | 
			
		||||
@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls+=2;
 | 
			
		||||
  conformable(in._grid,FermionGrid()); // verifies full grid
 | 
			
		||||
  conformable(in._grid,out._grid);
 | 
			
		||||
 | 
			
		||||
@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
 | 
			
		||||
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::Report(void) 
 | 
			
		||||
{
 | 
			
		||||
  std::vector<int> latt = GridDefaultLatt();          
 | 
			
		||||
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
  RealD NP = _FourDimGrid->_Nprocessors;
 | 
			
		||||
  RealD NN = _FourDimGrid->NodeCount();
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
 | 
			
		||||
	    << DhopCalls   << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
 | 
			
		||||
	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
 | 
			
		||||
	    << DhopCommTime    / DhopCalls << " us" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
 | 
			
		||||
	    << DhopComputeTime / DhopCalls << " us" << std::endl;
 | 
			
		||||
 | 
			
		||||
  // Average the compute time
 | 
			
		||||
  _FourDimGrid->GlobalSum(DhopComputeTime);
 | 
			
		||||
  DhopComputeTime/=NP;
 | 
			
		||||
 | 
			
		||||
  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
 | 
			
		||||
  
 | 
			
		||||
  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
 | 
			
		||||
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 | 
			
		||||
}
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
 | 
			
		||||
{
 | 
			
		||||
  DhopCalls       = 0;
 | 
			
		||||
  DhopTotalTime    = 0;
 | 
			
		||||
  DhopCommTime    = 0;
 | 
			
		||||
  DhopComputeTime = 0;
 | 
			
		||||
  Stencil.ZeroCounters();
 | 
			
		||||
  StencilEven.ZeroCounters();
 | 
			
		||||
  StencilOdd.ZeroCounters();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Implement the general interface. Here we use SAME mass on all slices
 | 
			
		||||
 
 | 
			
		||||
@@ -55,6 +55,16 @@ namespace QCD {
 | 
			
		||||
      FermionField _tmp;
 | 
			
		||||
      FermionField &tmp(void) { return _tmp; }
 | 
			
		||||
 | 
			
		||||
      ////////////////////////////////////////
 | 
			
		||||
      // Performance monitoring
 | 
			
		||||
      ////////////////////////////////////////
 | 
			
		||||
      void Report(void);
 | 
			
		||||
      void ZeroCounters(void);
 | 
			
		||||
      double DhopTotalTime;
 | 
			
		||||
      double DhopCalls;
 | 
			
		||||
      double DhopCommTime;
 | 
			
		||||
      double DhopComputeTime;
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      // Implement the abstract base
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										502
									
								
								lib/qcd/action/fermion/MobiusEOFAFermion.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										502
									
								
								lib/qcd/action/fermion/MobiusEOFAFermion.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,502 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid_Eigen_Dense.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
    MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
 | 
			
		||||
      GaugeField            &_Umu,
 | 
			
		||||
      GridCartesian         &FiveDimGrid,
 | 
			
		||||
      GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
      GridCartesian         &FourDimGrid,
 | 
			
		||||
      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
      RealD _mq1, RealD _mq2, RealD _mq3,
 | 
			
		||||
      RealD _shift, int _pm, RealD _M5,
 | 
			
		||||
      RealD _b, RealD _c, const ImplParams &p) :
 | 
			
		||||
    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
 | 
			
		||||
        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
 | 
			
		||||
        _shift, _pm, _M5, _b, _c, p)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      RealD eps = 1.0;
 | 
			
		||||
      Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
 | 
			
		||||
      assert(zdata->n == this->Ls);
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
 | 
			
		||||
        ",c=" << _c << ") with Ls=" << Ls << std::endl;
 | 
			
		||||
      this->SetCoefficientsTanh(zdata, _b, _c);
 | 
			
		||||
      std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
 | 
			
		||||
        ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
 | 
			
		||||
        ",pm=" << _pm << ")" << std::endl;
 | 
			
		||||
 | 
			
		||||
      Approx::zolotarev_free(zdata);
 | 
			
		||||
 | 
			
		||||
      if(_shift != 0.0){
 | 
			
		||||
        SetCoefficientsPrecondShiftOps();
 | 
			
		||||
      } else {
 | 
			
		||||
        Mooee_shift.resize(Ls, 0.0);
 | 
			
		||||
        MooeeInv_shift_lc.resize(Ls, 0.0);
 | 
			
		||||
        MooeeInv_shift_norm.resize(Ls, 0.0);
 | 
			
		||||
        MooeeInvDag_shift_lc.resize(Ls, 0.0);
 | 
			
		||||
        MooeeInvDag_shift_norm.resize(Ls, 0.0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /****************************************************************
 | 
			
		||||
     * Additional EOFA operators only called outside the inverter.  
 | 
			
		||||
     * Since speed is not essential, simple axpby-style
 | 
			
		||||
     * implementations should be fine.
 | 
			
		||||
     ***************************************************************/
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      RealD alpha = this->alpha;
 | 
			
		||||
 | 
			
		||||
      Din = zero;
 | 
			
		||||
      if((sign == 1) && (dag == 0)) { // \Omega_{+}
 | 
			
		||||
        for(int s=0; s<Ls; ++s){
 | 
			
		||||
          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
 | 
			
		||||
        }
 | 
			
		||||
      } else if((sign == -1) && (dag == 0)) { // \Omega_{-}
 | 
			
		||||
        for(int s=0; s<Ls; ++s){
 | 
			
		||||
          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
 | 
			
		||||
        }
 | 
			
		||||
      } else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
 | 
			
		||||
        for(int sp=0; sp<Ls; ++sp){
 | 
			
		||||
          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
 | 
			
		||||
        }
 | 
			
		||||
      } else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
 | 
			
		||||
        for(int sp=0; sp<Ls; ++sp){
 | 
			
		||||
          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
 | 
			
		||||
    // It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls    = this->Ls;
 | 
			
		||||
      RealD b   = 0.5 * ( 1.0 + this->alpha );
 | 
			
		||||
      RealD c   = 0.5 * ( 1.0 - this->alpha );
 | 
			
		||||
      RealD mq1 = this->mq1;
 | 
			
		||||
 | 
			
		||||
      for(int s=0; s<Ls; ++s){
 | 
			
		||||
        if(s == 0) {
 | 
			
		||||
          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
 | 
			
		||||
          axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
 | 
			
		||||
        } else if(s == (Ls-1)) {
 | 
			
		||||
          axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
 | 
			
		||||
          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
 | 
			
		||||
        } else {
 | 
			
		||||
          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
 | 
			
		||||
          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      RealD m = this->mq1;
 | 
			
		||||
      RealD c = 0.5 * this->alpha;
 | 
			
		||||
      RealD d = 0.5;
 | 
			
		||||
 | 
			
		||||
      RealD DtInv_p(0.0), DtInv_m(0.0);
 | 
			
		||||
      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
 | 
			
		||||
      FermionField tmp(this->FermionGrid());
 | 
			
		||||
 | 
			
		||||
      for(int s=0; s<Ls; ++s){
 | 
			
		||||
      for(int sp=0; sp<Ls; ++sp){
 | 
			
		||||
 | 
			
		||||
        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
 | 
			
		||||
        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
 | 
			
		||||
        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
 | 
			
		||||
        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
 | 
			
		||||
 | 
			
		||||
        if(sp == 0){
 | 
			
		||||
          axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
 | 
			
		||||
          axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
 | 
			
		||||
        } else {
 | 
			
		||||
          axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
 | 
			
		||||
          axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      }}
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /*****************************************************************************************************/
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      FermionField Din(psi._grid);
 | 
			
		||||
 | 
			
		||||
      this->Meooe5D(psi, Din);
 | 
			
		||||
      this->DW(Din, chi, DaggerNo);
 | 
			
		||||
      axpby(chi, 1.0, 1.0, chi, psi);
 | 
			
		||||
      this->M5D(psi, chi);
 | 
			
		||||
      return(norm2(chi));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      FermionField Din(psi._grid);
 | 
			
		||||
 | 
			
		||||
      this->DW(psi, Din, DaggerYes);
 | 
			
		||||
      this->MeooeDag5D(Din, chi);
 | 
			
		||||
      this->M5Ddag(psi, chi);
 | 
			
		||||
      axpby(chi, 1.0, 1.0, chi, psi);
 | 
			
		||||
      return(norm2(chi));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /********************************************************************
 | 
			
		||||
     * Performance critical fermion operators called inside the inverter
 | 
			
		||||
     ********************************************************************/
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
 | 
			
		||||
      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
 | 
			
		||||
 | 
			
		||||
      // no shift term
 | 
			
		||||
      if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
 | 
			
		||||
 | 
			
		||||
      // fused M + shift operation
 | 
			
		||||
      else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
 | 
			
		||||
      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
 | 
			
		||||
 | 
			
		||||
      // no shift term
 | 
			
		||||
      if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
 | 
			
		||||
 | 
			
		||||
      // fused M + shift operation
 | 
			
		||||
      else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // half checkerboard operations
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      // coefficients of Mooee
 | 
			
		||||
      std::vector<Coeff_t> diag = this->bee;
 | 
			
		||||
      std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
      std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        upper[s] = -this->cee[s];
 | 
			
		||||
        lower[s] = -this->cee[s];
 | 
			
		||||
      }
 | 
			
		||||
      upper[Ls-1] *= -this->mq1;
 | 
			
		||||
      lower[0]    *= -this->mq1;
 | 
			
		||||
 | 
			
		||||
      // no shift term
 | 
			
		||||
      if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
 | 
			
		||||
 | 
			
		||||
      // fused M + shift operation
 | 
			
		||||
      else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      // coefficients of MooeeDag
 | 
			
		||||
      std::vector<Coeff_t> diag = this->bee;
 | 
			
		||||
      std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
      std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        if(s==0) {
 | 
			
		||||
          upper[s] = -this->cee[s+1];
 | 
			
		||||
          lower[s] = this->mq1*this->cee[Ls-1];
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          upper[s] = this->mq1*this->cee[0];
 | 
			
		||||
          lower[s] = -this->cee[s-1];
 | 
			
		||||
        } else {
 | 
			
		||||
          upper[s] = -this->cee[s+1];
 | 
			
		||||
          lower[s] = -this->cee[s-1];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // no shift term
 | 
			
		||||
      if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
 | 
			
		||||
 | 
			
		||||
      // fused M + shift operation
 | 
			
		||||
      else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /****************************************************************************************/
 | 
			
		||||
 | 
			
		||||
    // Computes coefficients for applying Cayley preconditioned shift operators
 | 
			
		||||
    //  (Mooee + \Delta) --> Mooee_shift
 | 
			
		||||
    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
 | 
			
		||||
    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
 | 
			
		||||
    // For the latter two cases, the operation takes the form
 | 
			
		||||
    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
 | 
			
		||||
    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
 | 
			
		||||
    {
 | 
			
		||||
      int   Ls    = this->Ls;
 | 
			
		||||
      int   pm    = this->pm;
 | 
			
		||||
      RealD alpha = this->alpha;
 | 
			
		||||
      RealD k     = this->k;
 | 
			
		||||
      RealD mq1   = this->mq1;
 | 
			
		||||
      RealD shift = this->shift;
 | 
			
		||||
 | 
			
		||||
      // Initialize
 | 
			
		||||
      Mooee_shift.resize(Ls);
 | 
			
		||||
      MooeeInv_shift_lc.resize(Ls);
 | 
			
		||||
      MooeeInv_shift_norm.resize(Ls);
 | 
			
		||||
      MooeeInvDag_shift_lc.resize(Ls);
 | 
			
		||||
      MooeeInvDag_shift_norm.resize(Ls);
 | 
			
		||||
 | 
			
		||||
      // Construct Mooee_shift
 | 
			
		||||
      int idx(0);
 | 
			
		||||
      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
 | 
			
		||||
                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
 | 
			
		||||
      for(int s=0; s<Ls; ++s){
 | 
			
		||||
        idx = (pm == 1) ? (s) : (Ls-1-s);
 | 
			
		||||
        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Tridiagonal solve for MooeeInvDag_shift_lc
 | 
			
		||||
      {
 | 
			
		||||
        Coeff_t m(0.0);
 | 
			
		||||
        std::vector<Coeff_t> d = Mooee_shift;
 | 
			
		||||
        std::vector<Coeff_t> u(Ls,0.0);
 | 
			
		||||
        std::vector<Coeff_t> y(Ls,0.0);
 | 
			
		||||
        std::vector<Coeff_t> q(Ls,0.0);
 | 
			
		||||
        if(pm == 1){ u[0] = 1.0; }
 | 
			
		||||
        else{ u[Ls-1] = 1.0; }
 | 
			
		||||
 | 
			
		||||
        // Tridiagonal matrix algorithm + Sherman-Morrison formula
 | 
			
		||||
        //
 | 
			
		||||
        // We solve
 | 
			
		||||
        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
 | 
			
		||||
        // where Mooee' is the tridiagonal part of Mooee_{+}, and
 | 
			
		||||
        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
 | 
			
		||||
        // so that the outer-product u \otimes v gives the (0,Ls-1)
 | 
			
		||||
        // entry of Mooee_{+}.
 | 
			
		||||
        //
 | 
			
		||||
        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
 | 
			
		||||
        // and then construct the solution to the original system
 | 
			
		||||
        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
 | 
			
		||||
        if(pm == 1){
 | 
			
		||||
          for(int s=1; s<Ls; ++s){
 | 
			
		||||
            m = -this->cee[s] / this->bee[s-1];
 | 
			
		||||
            d[s] -= m*d[s-1];
 | 
			
		||||
            u[s] -= m*u[s-1];
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
 | 
			
		||||
        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
 | 
			
		||||
        for(int s=Ls-2; s>=0; --s){
 | 
			
		||||
          if(pm == 1){
 | 
			
		||||
            y[s] = d[s] / this->bee[s];
 | 
			
		||||
            q[s] = u[s] / this->bee[s];
 | 
			
		||||
          } else {
 | 
			
		||||
            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
 | 
			
		||||
            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Construct MooeeInvDag_shift_lc
 | 
			
		||||
        for(int s=0; s<Ls; ++s){
 | 
			
		||||
          if(pm == 1){
 | 
			
		||||
            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
 | 
			
		||||
              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
 | 
			
		||||
          } else {
 | 
			
		||||
            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
 | 
			
		||||
              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Compute remaining coefficients
 | 
			
		||||
        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
 | 
			
		||||
        for(int s=0; s<Ls; ++s){
 | 
			
		||||
 | 
			
		||||
          // MooeeInv_shift_lc
 | 
			
		||||
          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
 | 
			
		||||
          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
 | 
			
		||||
 | 
			
		||||
          // MooeeInv_shift_norm
 | 
			
		||||
          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
 | 
			
		||||
            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
 | 
			
		||||
 | 
			
		||||
          // MooeeInvDag_shift_norm
 | 
			
		||||
          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
 | 
			
		||||
            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
 | 
			
		||||
          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
 | 
			
		||||
            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Recompute coefficients for a different value of shift constant
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
 | 
			
		||||
    {
 | 
			
		||||
      this->shift = new_shift;
 | 
			
		||||
      if(new_shift != 0.0){
 | 
			
		||||
        SetCoefficientsPrecondShiftOps();
 | 
			
		||||
      } else {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        Mooee_shift.resize(Ls,0.0);
 | 
			
		||||
        MooeeInv_shift_lc.resize(Ls,0.0);
 | 
			
		||||
        MooeeInv_shift_norm.resize(Ls,0.0);
 | 
			
		||||
        MooeeInvDag_shift_lc.resize(Ls,0.0);
 | 
			
		||||
        MooeeInvDag_shift_norm.resize(Ls,0.0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
 | 
			
		||||
      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
    {
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
      GridBase* grid = this->FermionRedBlackGrid();
 | 
			
		||||
      int LLs = grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
      if(LLs == Ls){ return; } // Not vectorised in 5th direction
 | 
			
		||||
 | 
			
		||||
      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        Pplus(s,s)  = this->bee[s];
 | 
			
		||||
        Pminus(s,s) = this->bee[s];
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        Pminus(s,s+1) = -this->cee[s];
 | 
			
		||||
        Pplus(s+1,s) = -this->cee[s+1];
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      Pplus (0,Ls-1) = this->mq1*this->cee[0];
 | 
			
		||||
      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
 | 
			
		||||
 | 
			
		||||
      if(this->shift != 0.0){
 | 
			
		||||
        RealD c = 0.5 * this->alpha;
 | 
			
		||||
        RealD d = 0.5;
 | 
			
		||||
        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
 | 
			
		||||
        if(this->pm == 1) {
 | 
			
		||||
          for(int s=0; s<Ls; ++s){
 | 
			
		||||
            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
 | 
			
		||||
          }
 | 
			
		||||
        } else {
 | 
			
		||||
          for(int s=0; s<Ls; ++s){
 | 
			
		||||
            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      Eigen::MatrixXcd PplusMat ;
 | 
			
		||||
      Eigen::MatrixXcd PminusMat;
 | 
			
		||||
 | 
			
		||||
      if(inv) {
 | 
			
		||||
        PplusMat  = Pplus.inverse();
 | 
			
		||||
        PminusMat = Pminus.inverse();
 | 
			
		||||
      } else {
 | 
			
		||||
        PplusMat  = Pplus;
 | 
			
		||||
        PminusMat = Pminus;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      if(dag){
 | 
			
		||||
        PplusMat.adjointInPlace();
 | 
			
		||||
        PminusMat.adjointInPlace();
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      typedef typename SiteHalfSpinor::scalar_type scalar_type;
 | 
			
		||||
      const int Nsimd = Simd::Nsimd();
 | 
			
		||||
      Matp.resize(Ls*LLs);
 | 
			
		||||
      Matm.resize(Ls*LLs);
 | 
			
		||||
 | 
			
		||||
      for(int s2=0; s2<Ls; s2++){
 | 
			
		||||
      for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
        int istride = LLs;
 | 
			
		||||
        int ostride = 1;
 | 
			
		||||
        Simd Vp;
 | 
			
		||||
        Simd Vm;
 | 
			
		||||
        scalar_type *sp = (scalar_type*) &Vp;
 | 
			
		||||
        scalar_type *sm = (scalar_type*) &Vm;
 | 
			
		||||
        for(int l=0; l<Nsimd; l++){
 | 
			
		||||
          if(switcheroo<Coeff_t>::iscomplex()) {
 | 
			
		||||
            sp[l] = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
            sm[l] = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
          } else {
 | 
			
		||||
            // if real
 | 
			
		||||
            scalar_type tmp;
 | 
			
		||||
            tmp = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
            sp[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
            tmp = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
            sm[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
        Matp[LLs*s2+s1] = Vp;
 | 
			
		||||
        Matm[LLs*s2+s1] = Vm;
 | 
			
		||||
      }}
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  FermOpTemplateInstantiate(MobiusEOFAFermion);
 | 
			
		||||
  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										133
									
								
								lib/qcd/action/fermion/MobiusEOFAFermion.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										133
									
								
								lib/qcd/action/fermion/MobiusEOFAFermion.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,133 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef  GRID_QCD_MOBIUS_EOFA_FERMION_H
 | 
			
		||||
#define  GRID_QCD_MOBIUS_EOFA_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
 | 
			
		||||
  {
 | 
			
		||||
    public:
 | 
			
		||||
      INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
      // Shift operator coefficients for red-black preconditioned Mobius EOFA
 | 
			
		||||
      std::vector<Coeff_t> Mooee_shift;
 | 
			
		||||
      std::vector<Coeff_t> MooeeInv_shift_lc;
 | 
			
		||||
      std::vector<Coeff_t> MooeeInv_shift_norm;
 | 
			
		||||
      std::vector<Coeff_t> MooeeInvDag_shift_lc;
 | 
			
		||||
      std::vector<Coeff_t> MooeeInvDag_shift_norm;
 | 
			
		||||
 | 
			
		||||
      virtual void Instantiatable(void) {};
 | 
			
		||||
 | 
			
		||||
      // EOFA-specific operations
 | 
			
		||||
      virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
 | 
			
		||||
      virtual void  Dtilde           (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  DtildeInv        (const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      // override multiply
 | 
			
		||||
      virtual RealD M                (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual RealD Mdag             (const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      // half checkerboard operations
 | 
			
		||||
      virtual void  Mooee            (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeDag         (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInv         (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      virtual void   M5D             (const FermionField& psi, FermionField& chi);
 | 
			
		||||
      virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);
 | 
			
		||||
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // Instantiate different versions depending on Impl
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
 | 
			
		||||
 | 
			
		||||
      void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
 | 
			
		||||
        std::vector<Coeff_t>& shift_coeffs);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
 | 
			
		||||
        std::vector<Coeff_t>& shift_coeffs);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
 | 
			
		||||
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
 | 
			
		||||
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      virtual void RefreshShiftCoefficients(RealD new_shift);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
 | 
			
		||||
        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
 | 
			
		||||
        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
 | 
			
		||||
        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsPrecondShiftOps(void);
 | 
			
		||||
  };
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
 | 
			
		||||
template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
 | 
			
		||||
template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
 | 
			
		||||
 | 
			
		||||
#undef  MOBIUS_EOFA_DPERP_DENSE
 | 
			
		||||
#define MOBIUS_EOFA_DPERP_CACHE
 | 
			
		||||
#undef  MOBIUS_EOFA_DPERP_LINALG
 | 
			
		||||
#define MOBIUS_EOFA_DPERP_VEC
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										429
									
								
								lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										429
									
								
								lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,429 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        auto tmp = psi._odata[0];
 | 
			
		||||
        if(s==0){
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
 | 
			
		||||
    std::vector<Coeff_t> &shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        auto tmp = psi._odata[0];
 | 
			
		||||
        if(s==0){
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
 | 
			
		||||
        else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
 | 
			
		||||
        chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      auto tmp = psi._odata[0];
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        if(s==0) {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
 | 
			
		||||
    std::vector<Coeff_t> &shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      chi[ss+Ls-1] = zero;
 | 
			
		||||
      auto tmp = psi._odata[0];
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        if(s==0) {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
 | 
			
		||||
        else{ spProj5m(tmp, psi._odata[ss+s]); }
 | 
			
		||||
        chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
 | 
			
		||||
  {
 | 
			
		||||
    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
 | 
			
		||||
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvCalls++;
 | 
			
		||||
    this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
 | 
			
		||||
      auto tmp = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
      // Apply (L^{\prime})^{-1}
 | 
			
		||||
      chi[ss] = psi[ss]; // chi[0]=psi[0]
 | 
			
		||||
      for(int s=1; s<Ls; s++){
 | 
			
		||||
        spProj5p(tmp, chi[ss+s-1]);
 | 
			
		||||
        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // L_m^{-1}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
        spProj5m(tmp, chi[ss+s]);
 | 
			
		||||
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // U_m^{-1} D^{-1}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
 | 
			
		||||
        spProj5p(tmp, chi[ss+Ls-1]);
 | 
			
		||||
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
 | 
			
		||||
      }
 | 
			
		||||
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
 | 
			
		||||
 | 
			
		||||
      // Apply U^{-1}
 | 
			
		||||
      for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
        spProj5m(tmp, chi[ss+s+1]);
 | 
			
		||||
        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvTime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvCalls++;
 | 
			
		||||
    this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
 | 
			
		||||
      auto tmp1        = psi._odata[0];
 | 
			
		||||
      auto tmp2        = psi._odata[0];
 | 
			
		||||
      auto tmp2_spProj = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
      // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
 | 
			
		||||
      chi[ss] = psi[ss]; // chi[0]=psi[0]
 | 
			
		||||
      tmp2 = MooeeInv_shift_lc[0]*psi[ss];
 | 
			
		||||
      for(int s=1; s<Ls; s++){
 | 
			
		||||
        spProj5p(tmp1, chi[ss+s-1]);
 | 
			
		||||
        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
 | 
			
		||||
        tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
 | 
			
		||||
      }
 | 
			
		||||
      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
 | 
			
		||||
      else{ spProj5m(tmp2_spProj, tmp2); }
 | 
			
		||||
 | 
			
		||||
      // L_m^{-1}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
        spProj5m(tmp1, chi[ss+s]);
 | 
			
		||||
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // U_m^{-1} D^{-1}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
 | 
			
		||||
        spProj5p(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
 | 
			
		||||
      }
 | 
			
		||||
      // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
 | 
			
		||||
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
 | 
			
		||||
      spProj5m(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
 | 
			
		||||
 | 
			
		||||
      // Apply U^{-1} and add shift term
 | 
			
		||||
      for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
 | 
			
		||||
        spProj5m(tmp1, chi[ss+s]);
 | 
			
		||||
        chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvTime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
 | 
			
		||||
  {
 | 
			
		||||
    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
 | 
			
		||||
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvCalls++;
 | 
			
		||||
    this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
 | 
			
		||||
      auto tmp = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
      // Apply (U^{\prime})^{-dag}
 | 
			
		||||
      chi[ss] = psi[ss];
 | 
			
		||||
      for(int s=1; s<Ls; s++){
 | 
			
		||||
        spProj5m(tmp, chi[ss+s-1]);
 | 
			
		||||
        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // U_m^{-\dag}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        spProj5p(tmp, chi[ss+s]);
 | 
			
		||||
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // L_m^{-\dag} D^{-dag}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        spProj5m(tmp, chi[ss+Ls-1]);
 | 
			
		||||
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
 | 
			
		||||
      }
 | 
			
		||||
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
 | 
			
		||||
 | 
			
		||||
      // Apply L^{-dag}
 | 
			
		||||
      for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
        spProj5p(tmp, chi[ss+s+1]);
 | 
			
		||||
        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvTime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvCalls++;
 | 
			
		||||
    this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
 | 
			
		||||
      auto tmp1        = psi._odata[0];
 | 
			
		||||
      auto tmp2        = psi._odata[0];
 | 
			
		||||
      auto tmp2_spProj = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
      // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
 | 
			
		||||
      chi[ss] = psi[ss];
 | 
			
		||||
      tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
 | 
			
		||||
      for(int s=1; s<Ls; s++){
 | 
			
		||||
        spProj5m(tmp1, chi[ss+s-1]);
 | 
			
		||||
        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
 | 
			
		||||
        tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
 | 
			
		||||
      }
 | 
			
		||||
      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
 | 
			
		||||
      else{ spProj5m(tmp2_spProj, tmp2); }
 | 
			
		||||
 | 
			
		||||
      // U_m^{-\dag}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        spProj5p(tmp1, chi[ss+s]);
 | 
			
		||||
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // L_m^{-\dag} D^{-dag}
 | 
			
		||||
      for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        spProj5m(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
 | 
			
		||||
      }
 | 
			
		||||
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
 | 
			
		||||
      spProj5p(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
 | 
			
		||||
 | 
			
		||||
      // Apply L^{-dag}
 | 
			
		||||
      for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
 | 
			
		||||
        spProj5p(tmp1, chi[ss+s]);
 | 
			
		||||
        chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvTime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  #ifdef MOBIUS_EOFA_DPERP_CACHE
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
  #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										184
									
								
								lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										184
									
								
								lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,184 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid_Eigen_Dense.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
  * Dense matrix versions of routines
 | 
			
		||||
  */
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    int LLs = psi._grid->_rdimensions[0];
 | 
			
		||||
    int vol = psi._grid->oSites()/LLs;
 | 
			
		||||
 | 
			
		||||
    int pm      = this->pm;
 | 
			
		||||
    RealD shift = this->shift;
 | 
			
		||||
    RealD alpha = this->alpha;
 | 
			
		||||
    RealD k     = this->k;
 | 
			
		||||
    RealD mq1   = this->mq1;
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    assert(Ls==LLs);
 | 
			
		||||
 | 
			
		||||
    Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
 | 
			
		||||
    Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
 | 
			
		||||
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
        Pplus(s,s)  = this->bee[s];
 | 
			
		||||
        Pminus(s,s) = this->bee[s];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        Pminus(s,s+1) = -this->cee[s];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
        Pplus(s+1,s) = -this->cee[s+1];
 | 
			
		||||
    }
 | 
			
		||||
    Pplus (0,Ls-1) = mq1*this->cee[0];
 | 
			
		||||
    Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
 | 
			
		||||
 | 
			
		||||
    if(shift != 0.0){
 | 
			
		||||
      Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
 | 
			
		||||
      for(int s=0; s<Ls; ++s){
 | 
			
		||||
        if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
 | 
			
		||||
        else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Eigen::MatrixXd PplusMat ;
 | 
			
		||||
    Eigen::MatrixXd PminusMat;
 | 
			
		||||
 | 
			
		||||
    if(inv){
 | 
			
		||||
      PplusMat  = Pplus.inverse();
 | 
			
		||||
      PminusMat = Pminus.inverse();
 | 
			
		||||
    } else {
 | 
			
		||||
      PplusMat  = Pplus;
 | 
			
		||||
      PminusMat = Pminus;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(dag){
 | 
			
		||||
      PplusMat.adjointInPlace();
 | 
			
		||||
      PminusMat.adjointInPlace();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // For the non-vectorised s-direction this is simple
 | 
			
		||||
 | 
			
		||||
    for(auto site=0; site<vol; site++){
 | 
			
		||||
 | 
			
		||||
        SiteSpinor     SiteChi;
 | 
			
		||||
        SiteHalfSpinor SitePplus;
 | 
			
		||||
        SiteHalfSpinor SitePminus;
 | 
			
		||||
 | 
			
		||||
        for(int s1=0; s1<Ls; s1++){
 | 
			
		||||
            SiteChi = zero;
 | 
			
		||||
            for(int s2=0; s2<Ls; s2++){
 | 
			
		||||
                int lex2 = s2 + Ls*site;
 | 
			
		||||
                if(PplusMat(s1,s2) != 0.0){
 | 
			
		||||
                    spProj5p(SitePplus,psi[lex2]);
 | 
			
		||||
                    accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
 | 
			
		||||
                }
 | 
			
		||||
                if(PminusMat(s1,s2) != 0.0){
 | 
			
		||||
                    spProj5m(SitePminus, psi[lex2]);
 | 
			
		||||
                    accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            chi[s1+Ls*site] = SiteChi*0.5;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  #ifdef MOBIUS_EOFA_DPERP_DENSE
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
    template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
    template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
  #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										290
									
								
								lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										290
									
								
								lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,290 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 | 
			
		||||
  // Pminus fowards
 | 
			
		||||
  // Pplus  backwards
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
  {
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    for(int s=0; s<Ls; s++){
 | 
			
		||||
      if(s==0) {
 | 
			
		||||
        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
 | 
			
		||||
      } else if (s==(Ls-1)) {
 | 
			
		||||
        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
 | 
			
		||||
        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      } else {
 | 
			
		||||
        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
 | 
			
		||||
    std::vector<Coeff_t>& shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    for(int s=0; s<Ls; s++){
 | 
			
		||||
      if(s==0) {
 | 
			
		||||
        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
 | 
			
		||||
      } else if (s==(Ls-1)) {
 | 
			
		||||
        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
 | 
			
		||||
        axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      } else {
 | 
			
		||||
        axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      }
 | 
			
		||||
      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
 | 
			
		||||
      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
  {
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    for(int s=0; s<Ls; s++){
 | 
			
		||||
      if(s==0) {
 | 
			
		||||
        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
 | 
			
		||||
      } else if (s==(Ls-1)) {
 | 
			
		||||
        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
 | 
			
		||||
        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      } else {
 | 
			
		||||
        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
 | 
			
		||||
    std::vector<Coeff_t>& shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    for(int s=0; s<Ls; s++){
 | 
			
		||||
      if(s==0) {
 | 
			
		||||
        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
 | 
			
		||||
      } else if (s==(Ls-1)) {
 | 
			
		||||
        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
 | 
			
		||||
        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      } else {
 | 
			
		||||
        axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
 | 
			
		||||
        axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
 | 
			
		||||
      }
 | 
			
		||||
      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
 | 
			
		||||
      else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
 | 
			
		||||
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    Coeff_t czero(0.0);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    // Apply (L^{\prime})^{-1}
 | 
			
		||||
    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
 | 
			
		||||
    for(int s=1; s<Ls; s++){
 | 
			
		||||
      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // L_m^{-1}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // U_m^{-1} D^{-1}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
 | 
			
		||||
    }
 | 
			
		||||
    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
 | 
			
		||||
 | 
			
		||||
    // Apply U^{-1}
 | 
			
		||||
    for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    Coeff_t czero(0.0);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    FermionField tmp(psi._grid);
 | 
			
		||||
 | 
			
		||||
    // Apply (L^{\prime})^{-1}
 | 
			
		||||
    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
 | 
			
		||||
    axpby_ssp(tmp, czero, tmp, this->MooeeInv_shift_lc[0], psi, 0, 0);
 | 
			
		||||
    for(int s=1; s<Ls; s++){
 | 
			
		||||
      axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
 | 
			
		||||
      axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // L_m^{-1}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
      axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // U_m^{-1} D^{-1}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
      axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
 | 
			
		||||
    }
 | 
			
		||||
    axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
 | 
			
		||||
 | 
			
		||||
    // Apply U^{-1} and add shift term
 | 
			
		||||
    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
 | 
			
		||||
    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
 | 
			
		||||
    for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
      axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
 | 
			
		||||
      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
 | 
			
		||||
      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
 | 
			
		||||
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    Coeff_t czero(0.0);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    // Apply (U^{\prime})^{-dagger}
 | 
			
		||||
    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
 | 
			
		||||
    for(int s=1; s<Ls; s++){
 | 
			
		||||
      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // U_m^{-\dagger}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // L_m^{-\dagger} D^{-dagger}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
 | 
			
		||||
    }
 | 
			
		||||
    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
 | 
			
		||||
 | 
			
		||||
    // Apply L^{-dagger}
 | 
			
		||||
    for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    Coeff_t one(1.0);
 | 
			
		||||
    Coeff_t czero(0.0);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
    FermionField tmp(psi._grid);
 | 
			
		||||
 | 
			
		||||
    // Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
 | 
			
		||||
    axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
 | 
			
		||||
    axpby_ssp(tmp, czero, tmp, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
 | 
			
		||||
    for(int s=1; s<Ls; s++){
 | 
			
		||||
      axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
 | 
			
		||||
      axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // U_m^{-\dagger}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
      axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // L_m^{-\dagger} D^{-dagger}
 | 
			
		||||
    for(int s=0; s<Ls-1; s++){
 | 
			
		||||
      axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
 | 
			
		||||
    }
 | 
			
		||||
    axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
 | 
			
		||||
 | 
			
		||||
    // Apply L^{-dagger} and add shift
 | 
			
		||||
    if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
 | 
			
		||||
    else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
 | 
			
		||||
    for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
      axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
 | 
			
		||||
      if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
 | 
			
		||||
      else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  #ifdef MOBIUS_EOFA_DPERP_LINALG
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
  #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
							
								
								
									
										983
									
								
								lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										983
									
								
								lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,983 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
  * Dense matrix versions of routines
 | 
			
		||||
  */
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
 | 
			
		||||
  {
 | 
			
		||||
    this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase* grid  = psi._grid;
 | 
			
		||||
    int Ls          = this->Ls;
 | 
			
		||||
    int LLs         = grid->_rdimensions[0];
 | 
			
		||||
    const int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
    Vector<iSinglet<Simd>> u(LLs);
 | 
			
		||||
    Vector<iSinglet<Simd>> l(LLs);
 | 
			
		||||
    Vector<iSinglet<Simd>> d(LLs);
 | 
			
		||||
 | 
			
		||||
    assert(Ls/LLs == nsimd);
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // just directly address via type pun
 | 
			
		||||
    typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
    scalar_type* u_p = (scalar_type*) &u[0];
 | 
			
		||||
    scalar_type* l_p = (scalar_type*) &l[0];
 | 
			
		||||
    scalar_type* d_p = (scalar_type*) &d[0];
 | 
			
		||||
 | 
			
		||||
    for(int o=0; o<LLs; o++){ // outer
 | 
			
		||||
    for(int i=0; i<nsimd; i++){ //inner
 | 
			
		||||
      int s   = o + i*LLs;
 | 
			
		||||
      int ss  = o*nsimd + i;
 | 
			
		||||
      u_p[ss] = upper[s];
 | 
			
		||||
      l_p[ss] = lower[s];
 | 
			
		||||
      d_p[ss] = diag[s];
 | 
			
		||||
    }}
 | 
			
		||||
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    assert(Nc == 3);
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
      #if 0
 | 
			
		||||
 | 
			
		||||
        alignas(64) SiteHalfSpinor hp;
 | 
			
		||||
        alignas(64) SiteHalfSpinor hm;
 | 
			
		||||
        alignas(64) SiteSpinor fp;
 | 
			
		||||
        alignas(64) SiteSpinor fm;
 | 
			
		||||
 | 
			
		||||
        for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
          int vp = (v+1)%LLs;
 | 
			
		||||
          int vm = (v+LLs-1)%LLs;
 | 
			
		||||
 | 
			
		||||
          spProj5m(hp, psi[ss+vp]);
 | 
			
		||||
          spProj5p(hm, psi[ss+vm]);
 | 
			
		||||
 | 
			
		||||
          if (vp <= v){ rotate(hp, hp, 1); }
 | 
			
		||||
          if (vm >= v){ rotate(hm, hm, nsimd-1); }
 | 
			
		||||
 | 
			
		||||
          hp = 0.5*hp;
 | 
			
		||||
          hm = 0.5*hm;
 | 
			
		||||
 | 
			
		||||
          spRecon5m(fp, hp);
 | 
			
		||||
          spRecon5p(fm, hm);
 | 
			
		||||
 | 
			
		||||
          chi[ss+v] = d[v]*phi[ss+v];
 | 
			
		||||
          chi[ss+v] = chi[ss+v] + u[v]*fp;
 | 
			
		||||
          chi[ss+v] = chi[ss+v] + l[v]*fm;
 | 
			
		||||
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      #else
 | 
			
		||||
 | 
			
		||||
        for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
          vprefetch(psi[ss+v+LLs]);
 | 
			
		||||
 | 
			
		||||
          int vp = (v == LLs-1) ? 0     : v+1;
 | 
			
		||||
          int vm = (v == 0)     ? LLs-1 : v-1;
 | 
			
		||||
 | 
			
		||||
          Simd hp_00 = psi[ss+vp]()(2)(0);
 | 
			
		||||
          Simd hp_01 = psi[ss+vp]()(2)(1);
 | 
			
		||||
          Simd hp_02 = psi[ss+vp]()(2)(2);
 | 
			
		||||
          Simd hp_10 = psi[ss+vp]()(3)(0);
 | 
			
		||||
          Simd hp_11 = psi[ss+vp]()(3)(1);
 | 
			
		||||
          Simd hp_12 = psi[ss+vp]()(3)(2);
 | 
			
		||||
 | 
			
		||||
          Simd hm_00 = psi[ss+vm]()(0)(0);
 | 
			
		||||
          Simd hm_01 = psi[ss+vm]()(0)(1);
 | 
			
		||||
          Simd hm_02 = psi[ss+vm]()(0)(2);
 | 
			
		||||
          Simd hm_10 = psi[ss+vm]()(1)(0);
 | 
			
		||||
          Simd hm_11 = psi[ss+vm]()(1)(1);
 | 
			
		||||
          Simd hm_12 = psi[ss+vm]()(1)(2);
 | 
			
		||||
 | 
			
		||||
          if(vp <= v){
 | 
			
		||||
            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		||||
            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		||||
            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		||||
            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		||||
            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		||||
            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(vm >= v){
 | 
			
		||||
            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		||||
            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		||||
            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		||||
            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		||||
            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		||||
            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          // Can force these to real arithmetic and save 2x.
 | 
			
		||||
          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
 | 
			
		||||
          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
 | 
			
		||||
          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
 | 
			
		||||
          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
 | 
			
		||||
          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
 | 
			
		||||
          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
 | 
			
		||||
          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
 | 
			
		||||
          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
 | 
			
		||||
          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
 | 
			
		||||
          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
 | 
			
		||||
          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
 | 
			
		||||
          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
 | 
			
		||||
 | 
			
		||||
          vstream(chi[ss+v]()(0)(0), p_00);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(1), p_01);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(2), p_02);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(0), p_10);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(1), p_11);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(2), p_12);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(0), p_20);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(1), p_21);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(2), p_22);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(0), p_30);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(1), p_31);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(2), p_32);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      #endif
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
 | 
			
		||||
    std::vector<Coeff_t>& shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    #if 0
 | 
			
		||||
 | 
			
		||||
      this->M5D(psi, phi, chi, lower, diag, upper);
 | 
			
		||||
 | 
			
		||||
      // FIXME: possible gain from vectorizing shift operation as well?
 | 
			
		||||
      Coeff_t one(1.0);
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
 | 
			
		||||
        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    #else
 | 
			
		||||
 | 
			
		||||
      GridBase* grid  = psi._grid;
 | 
			
		||||
      int Ls          = this->Ls;
 | 
			
		||||
      int LLs         = grid->_rdimensions[0];
 | 
			
		||||
      const int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
      Vector<iSinglet<Simd>> u(LLs);
 | 
			
		||||
      Vector<iSinglet<Simd>> l(LLs);
 | 
			
		||||
      Vector<iSinglet<Simd>> d(LLs);
 | 
			
		||||
      Vector<iSinglet<Simd>> s(LLs);
 | 
			
		||||
 | 
			
		||||
      assert(Ls/LLs == nsimd);
 | 
			
		||||
      assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
 | 
			
		||||
      chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
      // just directly address via type pun
 | 
			
		||||
      typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
      scalar_type* u_p = (scalar_type*) &u[0];
 | 
			
		||||
      scalar_type* l_p = (scalar_type*) &l[0];
 | 
			
		||||
      scalar_type* d_p = (scalar_type*) &d[0];
 | 
			
		||||
      scalar_type* s_p = (scalar_type*) &s[0];
 | 
			
		||||
 | 
			
		||||
      for(int o=0; o<LLs; o++){ // outer
 | 
			
		||||
      for(int i=0; i<nsimd; i++){ //inner
 | 
			
		||||
        int s   = o + i*LLs;
 | 
			
		||||
        int ss  = o*nsimd + i;
 | 
			
		||||
        u_p[ss] = upper[s];
 | 
			
		||||
        l_p[ss] = lower[s];
 | 
			
		||||
        d_p[ss] = diag[s];
 | 
			
		||||
        s_p[ss] = shift_coeffs[s];
 | 
			
		||||
      }}
 | 
			
		||||
 | 
			
		||||
      this->M5Dcalls++;
 | 
			
		||||
      this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
      assert(Nc == 3);
 | 
			
		||||
 | 
			
		||||
      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
        int vs     = (this->pm == 1) ? LLs-1 : 0;
 | 
			
		||||
        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
 | 
			
		||||
        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
 | 
			
		||||
        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
 | 
			
		||||
        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
 | 
			
		||||
        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
 | 
			
		||||
        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
 | 
			
		||||
 | 
			
		||||
        for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
          vprefetch(psi[ss+v+LLs]);
 | 
			
		||||
 | 
			
		||||
          int vp = (v == LLs-1) ? 0     : v+1;
 | 
			
		||||
          int vm = (v == 0)     ? LLs-1 : v-1;
 | 
			
		||||
 | 
			
		||||
          Simd hp_00 = psi[ss+vp]()(2)(0);
 | 
			
		||||
          Simd hp_01 = psi[ss+vp]()(2)(1);
 | 
			
		||||
          Simd hp_02 = psi[ss+vp]()(2)(2);
 | 
			
		||||
          Simd hp_10 = psi[ss+vp]()(3)(0);
 | 
			
		||||
          Simd hp_11 = psi[ss+vp]()(3)(1);
 | 
			
		||||
          Simd hp_12 = psi[ss+vp]()(3)(2);
 | 
			
		||||
 | 
			
		||||
          Simd hm_00 = psi[ss+vm]()(0)(0);
 | 
			
		||||
          Simd hm_01 = psi[ss+vm]()(0)(1);
 | 
			
		||||
          Simd hm_02 = psi[ss+vm]()(0)(2);
 | 
			
		||||
          Simd hm_10 = psi[ss+vm]()(1)(0);
 | 
			
		||||
          Simd hm_11 = psi[ss+vm]()(1)(1);
 | 
			
		||||
          Simd hm_12 = psi[ss+vm]()(1)(2);
 | 
			
		||||
 | 
			
		||||
          if(vp <= v){
 | 
			
		||||
            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		||||
            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		||||
            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		||||
            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		||||
            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		||||
            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(this->pm == 1 && vs <= v){
 | 
			
		||||
            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
 | 
			
		||||
            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
 | 
			
		||||
            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
 | 
			
		||||
            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
 | 
			
		||||
            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
 | 
			
		||||
            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(vm >= v){
 | 
			
		||||
            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		||||
            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		||||
            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		||||
            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		||||
            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		||||
            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(this->pm == -1 && vs >= v){
 | 
			
		||||
            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
 | 
			
		||||
            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
 | 
			
		||||
            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
 | 
			
		||||
            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
 | 
			
		||||
            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
 | 
			
		||||
            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          // Can force these to real arithmetic and save 2x.
 | 
			
		||||
          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
 | 
			
		||||
          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
 | 
			
		||||
          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
 | 
			
		||||
          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
 | 
			
		||||
          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
 | 
			
		||||
          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
 | 
			
		||||
          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
 | 
			
		||||
          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
 | 
			
		||||
          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
 | 
			
		||||
          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
 | 
			
		||||
          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
 | 
			
		||||
          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
 | 
			
		||||
 | 
			
		||||
          vstream(chi[ss+v]()(0)(0), p_00);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(1), p_01);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(2), p_02);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(0), p_10);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(1), p_11);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(2), p_12);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(0), p_20);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(1), p_21);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(2), p_22);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(0), p_30);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(1), p_31);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(2), p_32);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      this->M5Dtime += usecond();
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase* grid = psi._grid;
 | 
			
		||||
    int Ls  = this->Ls;
 | 
			
		||||
    int LLs = grid->_rdimensions[0];
 | 
			
		||||
    int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
    Vector<iSinglet<Simd>> u(LLs);
 | 
			
		||||
    Vector<iSinglet<Simd>> l(LLs);
 | 
			
		||||
    Vector<iSinglet<Simd>> d(LLs);
 | 
			
		||||
 | 
			
		||||
    assert(Ls/LLs == nsimd);
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // just directly address via type pun
 | 
			
		||||
    typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
    scalar_type* u_p = (scalar_type*) &u[0];
 | 
			
		||||
    scalar_type* l_p = (scalar_type*) &l[0];
 | 
			
		||||
    scalar_type* d_p = (scalar_type*) &d[0];
 | 
			
		||||
 | 
			
		||||
    for(int o=0; o<LLs; o++){ // outer
 | 
			
		||||
    for(int i=0; i<nsimd; i++){ //inner
 | 
			
		||||
      int s  = o + i*LLs;
 | 
			
		||||
      int ss = o*nsimd + i;
 | 
			
		||||
      u_p[ss] = upper[s];
 | 
			
		||||
      l_p[ss] = lower[s];
 | 
			
		||||
      d_p[ss] = diag[s];
 | 
			
		||||
    }}
 | 
			
		||||
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
      #if 0
 | 
			
		||||
 | 
			
		||||
        alignas(64) SiteHalfSpinor hp;
 | 
			
		||||
        alignas(64) SiteHalfSpinor hm;
 | 
			
		||||
        alignas(64) SiteSpinor fp;
 | 
			
		||||
        alignas(64) SiteSpinor fm;
 | 
			
		||||
 | 
			
		||||
        for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
          int vp = (v+1)%LLs;
 | 
			
		||||
          int vm = (v+LLs-1)%LLs;
 | 
			
		||||
 | 
			
		||||
          spProj5p(hp, psi[ss+vp]);
 | 
			
		||||
          spProj5m(hm, psi[ss+vm]);
 | 
			
		||||
 | 
			
		||||
          if(vp <= v){ rotate(hp, hp, 1); }
 | 
			
		||||
          if(vm >= v){ rotate(hm, hm, nsimd-1); }
 | 
			
		||||
 | 
			
		||||
          hp = hp*0.5;
 | 
			
		||||
          hm = hm*0.5;
 | 
			
		||||
          spRecon5p(fp, hp);
 | 
			
		||||
          spRecon5m(fm, hm);
 | 
			
		||||
 | 
			
		||||
          chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
 | 
			
		||||
          chi[ss+v] = chi[ss+v]     +l[v]*fm;
 | 
			
		||||
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      #else
 | 
			
		||||
 | 
			
		||||
        for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
          vprefetch(psi[ss+v+LLs]);
 | 
			
		||||
 | 
			
		||||
          int vp = (v == LLs-1) ? 0     : v+1;
 | 
			
		||||
          int vm = (v == 0    ) ? LLs-1 : v-1;
 | 
			
		||||
 | 
			
		||||
          Simd hp_00 = psi[ss+vp]()(0)(0);
 | 
			
		||||
          Simd hp_01 = psi[ss+vp]()(0)(1);
 | 
			
		||||
          Simd hp_02 = psi[ss+vp]()(0)(2);
 | 
			
		||||
          Simd hp_10 = psi[ss+vp]()(1)(0);
 | 
			
		||||
          Simd hp_11 = psi[ss+vp]()(1)(1);
 | 
			
		||||
          Simd hp_12 = psi[ss+vp]()(1)(2);
 | 
			
		||||
 | 
			
		||||
          Simd hm_00 = psi[ss+vm]()(2)(0);
 | 
			
		||||
          Simd hm_01 = psi[ss+vm]()(2)(1);
 | 
			
		||||
          Simd hm_02 = psi[ss+vm]()(2)(2);
 | 
			
		||||
          Simd hm_10 = psi[ss+vm]()(3)(0);
 | 
			
		||||
          Simd hm_11 = psi[ss+vm]()(3)(1);
 | 
			
		||||
          Simd hm_12 = psi[ss+vm]()(3)(2);
 | 
			
		||||
 | 
			
		||||
          if (vp <= v){
 | 
			
		||||
            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		||||
            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		||||
            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		||||
            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		||||
            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		||||
            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(vm >= v){
 | 
			
		||||
            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		||||
            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		||||
            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		||||
            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		||||
            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		||||
            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
 | 
			
		||||
          Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
 | 
			
		||||
          Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
 | 
			
		||||
          Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
 | 
			
		||||
          Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
 | 
			
		||||
          Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
 | 
			
		||||
          Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
 | 
			
		||||
          Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
 | 
			
		||||
          Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
 | 
			
		||||
          Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
 | 
			
		||||
          Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
 | 
			
		||||
          Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
 | 
			
		||||
 | 
			
		||||
          vstream(chi[ss+v]()(0)(0), p_00);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(1), p_01);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(2), p_02);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(0), p_10);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(1), p_11);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(2), p_12);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(0), p_20);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(1), p_21);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(2), p_22);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(0), p_30);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(1), p_31);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(2), p_32);
 | 
			
		||||
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      #endif
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
    FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
 | 
			
		||||
    std::vector<Coeff_t>& shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    #if 0
 | 
			
		||||
 | 
			
		||||
      this->M5Ddag(psi, phi, chi, lower, diag, upper);
 | 
			
		||||
 | 
			
		||||
      // FIXME: possible gain from vectorizing shift operation as well?
 | 
			
		||||
      Coeff_t one(1.0);
 | 
			
		||||
      int Ls = this->Ls;
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
 | 
			
		||||
        else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    #else
 | 
			
		||||
 | 
			
		||||
      GridBase* grid = psi._grid;
 | 
			
		||||
      int Ls  = this->Ls;
 | 
			
		||||
      int LLs = grid->_rdimensions[0];
 | 
			
		||||
      int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
      Vector<iSinglet<Simd>> u(LLs);
 | 
			
		||||
      Vector<iSinglet<Simd>> l(LLs);
 | 
			
		||||
      Vector<iSinglet<Simd>> d(LLs);
 | 
			
		||||
      Vector<iSinglet<Simd>> s(LLs);
 | 
			
		||||
 | 
			
		||||
      assert(Ls/LLs == nsimd);
 | 
			
		||||
      assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
 | 
			
		||||
      chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
      // just directly address via type pun
 | 
			
		||||
      typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
      scalar_type* u_p = (scalar_type*) &u[0];
 | 
			
		||||
      scalar_type* l_p = (scalar_type*) &l[0];
 | 
			
		||||
      scalar_type* d_p = (scalar_type*) &d[0];
 | 
			
		||||
      scalar_type* s_p = (scalar_type*) &s[0];
 | 
			
		||||
 | 
			
		||||
      for(int o=0; o<LLs; o++){ // outer
 | 
			
		||||
      for(int i=0; i<nsimd; i++){ //inner
 | 
			
		||||
        int s  = o + i*LLs;
 | 
			
		||||
        int ss = o*nsimd + i;
 | 
			
		||||
        u_p[ss] = upper[s];
 | 
			
		||||
        l_p[ss] = lower[s];
 | 
			
		||||
        d_p[ss] = diag[s];
 | 
			
		||||
        s_p[ss] = shift_coeffs[s];
 | 
			
		||||
      }}
 | 
			
		||||
 | 
			
		||||
      this->M5Dcalls++;
 | 
			
		||||
      this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
      parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
        int vs     = (this->pm == 1) ? LLs-1 : 0;
 | 
			
		||||
        Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
 | 
			
		||||
        Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
 | 
			
		||||
        Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
 | 
			
		||||
        Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
 | 
			
		||||
        Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
 | 
			
		||||
        Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
 | 
			
		||||
 | 
			
		||||
        for(int v=0; v<LLs; v++){
 | 
			
		||||
 | 
			
		||||
          vprefetch(psi[ss+v+LLs]);
 | 
			
		||||
 | 
			
		||||
          int vp = (v == LLs-1) ? 0     : v+1;
 | 
			
		||||
          int vm = (v == 0    ) ? LLs-1 : v-1;
 | 
			
		||||
 | 
			
		||||
          Simd hp_00 = psi[ss+vp]()(0)(0);
 | 
			
		||||
          Simd hp_01 = psi[ss+vp]()(0)(1);
 | 
			
		||||
          Simd hp_02 = psi[ss+vp]()(0)(2);
 | 
			
		||||
          Simd hp_10 = psi[ss+vp]()(1)(0);
 | 
			
		||||
          Simd hp_11 = psi[ss+vp]()(1)(1);
 | 
			
		||||
          Simd hp_12 = psi[ss+vp]()(1)(2);
 | 
			
		||||
 | 
			
		||||
          Simd hm_00 = psi[ss+vm]()(2)(0);
 | 
			
		||||
          Simd hm_01 = psi[ss+vm]()(2)(1);
 | 
			
		||||
          Simd hm_02 = psi[ss+vm]()(2)(2);
 | 
			
		||||
          Simd hm_10 = psi[ss+vm]()(3)(0);
 | 
			
		||||
          Simd hm_11 = psi[ss+vm]()(3)(1);
 | 
			
		||||
          Simd hm_12 = psi[ss+vm]()(3)(2);
 | 
			
		||||
 | 
			
		||||
          if (vp <= v){
 | 
			
		||||
            hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 | 
			
		||||
            hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 | 
			
		||||
            hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 | 
			
		||||
            hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 | 
			
		||||
            hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 | 
			
		||||
            hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(this->pm == 1 && vs <= v){
 | 
			
		||||
            hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
 | 
			
		||||
            hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
 | 
			
		||||
            hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
 | 
			
		||||
            hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
 | 
			
		||||
            hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
 | 
			
		||||
            hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(vm >= v){
 | 
			
		||||
            hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 | 
			
		||||
            hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 | 
			
		||||
            hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 | 
			
		||||
            hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 | 
			
		||||
            hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 | 
			
		||||
            hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if(this->pm == -1 && vs >= v){
 | 
			
		||||
            hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
 | 
			
		||||
            hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
 | 
			
		||||
            hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
 | 
			
		||||
            hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
 | 
			
		||||
            hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
 | 
			
		||||
            hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
 | 
			
		||||
          Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
 | 
			
		||||
          Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
 | 
			
		||||
          Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
 | 
			
		||||
          Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
 | 
			
		||||
          Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
 | 
			
		||||
          Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
 | 
			
		||||
          Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
 | 
			
		||||
          Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
 | 
			
		||||
          Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
 | 
			
		||||
          Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
 | 
			
		||||
          Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
 | 
			
		||||
                                      : switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
 | 
			
		||||
                                                                                                 + switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
 | 
			
		||||
 | 
			
		||||
          vstream(chi[ss+v]()(0)(0), p_00);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(1), p_01);
 | 
			
		||||
          vstream(chi[ss+v]()(0)(2), p_02);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(0), p_10);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(1), p_11);
 | 
			
		||||
          vstream(chi[ss+v]()(1)(2), p_12);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(0), p_20);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(1), p_21);
 | 
			
		||||
          vstream(chi[ss+v]()(2)(2), p_22);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(0), p_30);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(1), p_31);
 | 
			
		||||
          vstream(chi[ss+v]()(3)(2), p_32);
 | 
			
		||||
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      this->M5Dtime += usecond();
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  #ifdef AVX512
 | 
			
		||||
    #include<simd/Intel512common.h>
 | 
			
		||||
    #include<simd/Intel512avx.h>
 | 
			
		||||
    #include<simd/Intel512single.h>
 | 
			
		||||
  #endif
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
 | 
			
		||||
    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
  {
 | 
			
		||||
    #ifndef AVX512
 | 
			
		||||
      {
 | 
			
		||||
        SiteHalfSpinor BcastP;
 | 
			
		||||
        SiteHalfSpinor BcastM;
 | 
			
		||||
        SiteHalfSpinor SiteChiP;
 | 
			
		||||
        SiteHalfSpinor SiteChiM;
 | 
			
		||||
 | 
			
		||||
        // Ls*Ls * 2 * 12 * vol flops
 | 
			
		||||
        for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
 | 
			
		||||
          for(int s2=0; s2<LLs; s2++){
 | 
			
		||||
          for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
 | 
			
		||||
 | 
			
		||||
            int s = s2 + l*LLs;
 | 
			
		||||
            int lex = s2 + LLs*site;
 | 
			
		||||
 | 
			
		||||
            if( s2==0 && l==0 ){
 | 
			
		||||
              SiteChiP=zero;
 | 
			
		||||
              SiteChiM=zero;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            for(int sp=0; sp<2;  sp++){
 | 
			
		||||
            for(int co=0; co<Nc; co++){
 | 
			
		||||
              vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
 | 
			
		||||
            }}
 | 
			
		||||
 | 
			
		||||
            for(int sp=0; sp<2;  sp++){
 | 
			
		||||
            for(int co=0; co<Nc; co++){
 | 
			
		||||
              vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
 | 
			
		||||
            }}
 | 
			
		||||
 | 
			
		||||
            for(int sp=0; sp<2;  sp++){
 | 
			
		||||
            for(int co=0; co<Nc; co++){
 | 
			
		||||
              SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
 | 
			
		||||
              SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
 | 
			
		||||
            }}
 | 
			
		||||
          }}
 | 
			
		||||
 | 
			
		||||
          {
 | 
			
		||||
            int lex = s1 + LLs*site;
 | 
			
		||||
            for(int sp=0; sp<2;  sp++){
 | 
			
		||||
            for(int co=0; co<Nc; co++){
 | 
			
		||||
              vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
 | 
			
		||||
              vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
 | 
			
		||||
            }}
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    #else
 | 
			
		||||
      {
 | 
			
		||||
        // pointers
 | 
			
		||||
        //  MASK_REGS;
 | 
			
		||||
        #define Chi_00 %%zmm1
 | 
			
		||||
        #define Chi_01 %%zmm2
 | 
			
		||||
        #define Chi_02 %%zmm3
 | 
			
		||||
        #define Chi_10 %%zmm4
 | 
			
		||||
        #define Chi_11 %%zmm5
 | 
			
		||||
        #define Chi_12 %%zmm6
 | 
			
		||||
        #define Chi_20 %%zmm7
 | 
			
		||||
        #define Chi_21 %%zmm8
 | 
			
		||||
        #define Chi_22 %%zmm9
 | 
			
		||||
        #define Chi_30 %%zmm10
 | 
			
		||||
        #define Chi_31 %%zmm11
 | 
			
		||||
        #define Chi_32 %%zmm12
 | 
			
		||||
 | 
			
		||||
        #define BCAST0  %%zmm13
 | 
			
		||||
        #define BCAST1  %%zmm14
 | 
			
		||||
        #define BCAST2  %%zmm15
 | 
			
		||||
        #define BCAST3  %%zmm16
 | 
			
		||||
        #define BCAST4  %%zmm17
 | 
			
		||||
        #define BCAST5  %%zmm18
 | 
			
		||||
        #define BCAST6  %%zmm19
 | 
			
		||||
        #define BCAST7  %%zmm20
 | 
			
		||||
        #define BCAST8  %%zmm21
 | 
			
		||||
        #define BCAST9  %%zmm22
 | 
			
		||||
        #define BCAST10 %%zmm23
 | 
			
		||||
        #define BCAST11 %%zmm24
 | 
			
		||||
 | 
			
		||||
        int incr = LLs*LLs*sizeof(iSinglet<Simd>);
 | 
			
		||||
 | 
			
		||||
        for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
 | 
			
		||||
          for(int s2=0; s2<LLs; s2++){
 | 
			
		||||
 | 
			
		||||
            int lex = s2 + LLs*site;
 | 
			
		||||
            uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
 | 
			
		||||
            uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
 | 
			
		||||
            uint64_t a2 = (uint64_t) &psi[lex];
 | 
			
		||||
 | 
			
		||||
            for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
 | 
			
		||||
 | 
			
		||||
              if((s2+l)==0) {
 | 
			
		||||
                asm(
 | 
			
		||||
                      VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
 | 
			
		||||
                      VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
 | 
			
		||||
                      VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
 | 
			
		||||
                      VBCASTCDUP(0,%2,BCAST0)
 | 
			
		||||
                      VBCASTCDUP(1,%2,BCAST1)
 | 
			
		||||
                      VBCASTCDUP(2,%2,BCAST2)
 | 
			
		||||
                      VBCASTCDUP(3,%2,BCAST3)
 | 
			
		||||
                      VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
 | 
			
		||||
                      VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
 | 
			
		||||
                      VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
 | 
			
		||||
                      VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
 | 
			
		||||
                      VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
 | 
			
		||||
                      VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
 | 
			
		||||
                      VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
 | 
			
		||||
                      VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
 | 
			
		||||
                      VMULMEM(0,%1,BCAST8,Chi_22)
 | 
			
		||||
                      VMULMEM(0,%1,BCAST9,Chi_30)
 | 
			
		||||
                      VMULMEM(0,%1,BCAST10,Chi_31)
 | 
			
		||||
                      VMULMEM(0,%1,BCAST11,Chi_32)
 | 
			
		||||
                      : : "r" (a0), "r" (a1), "r" (a2)                            );
 | 
			
		||||
              } else {
 | 
			
		||||
                asm(
 | 
			
		||||
                      VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
 | 
			
		||||
                      VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
 | 
			
		||||
                      VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
 | 
			
		||||
                      VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
 | 
			
		||||
                      VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
 | 
			
		||||
                      VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
 | 
			
		||||
                      VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
 | 
			
		||||
                      VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
 | 
			
		||||
                      VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
 | 
			
		||||
                      VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
 | 
			
		||||
                      VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
 | 
			
		||||
                      VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
 | 
			
		||||
                      : : "r" (a0), "r" (a1), "r" (a2)                            );
 | 
			
		||||
              }
 | 
			
		||||
 | 
			
		||||
              a0 = a0 + incr;
 | 
			
		||||
              a1 = a1 + incr;
 | 
			
		||||
              a2 = a2 + sizeof(Simd::scalar_type);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          {
 | 
			
		||||
            int lexa = s1+LLs*site;
 | 
			
		||||
            asm (
 | 
			
		||||
               VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
 | 
			
		||||
               VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
 | 
			
		||||
               VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
 | 
			
		||||
               VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
 | 
			
		||||
               : : "r" ((uint64_t)&chi[lexa]) : "memory" );
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      #undef Chi_00
 | 
			
		||||
      #undef Chi_01
 | 
			
		||||
      #undef Chi_02
 | 
			
		||||
      #undef Chi_10
 | 
			
		||||
      #undef Chi_11
 | 
			
		||||
      #undef Chi_12
 | 
			
		||||
      #undef Chi_20
 | 
			
		||||
      #undef Chi_21
 | 
			
		||||
      #undef Chi_22
 | 
			
		||||
      #undef Chi_30
 | 
			
		||||
      #undef Chi_31
 | 
			
		||||
      #undef Chi_32
 | 
			
		||||
 | 
			
		||||
      #undef BCAST0
 | 
			
		||||
      #undef BCAST1
 | 
			
		||||
      #undef BCAST2
 | 
			
		||||
      #undef BCAST3
 | 
			
		||||
      #undef BCAST4
 | 
			
		||||
      #undef BCAST5
 | 
			
		||||
      #undef BCAST6
 | 
			
		||||
      #undef BCAST7
 | 
			
		||||
      #undef BCAST8
 | 
			
		||||
      #undef BCAST9
 | 
			
		||||
      #undef BCAST10
 | 
			
		||||
      #undef BCAST11
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // Z-mobius version
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
 | 
			
		||||
    int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
 | 
			
		||||
    exit(-1);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls  = this->Ls;
 | 
			
		||||
    int LLs = psi._grid->_rdimensions[0];
 | 
			
		||||
    int vol = psi._grid->oSites()/LLs;
 | 
			
		||||
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    Vector<iSinglet<Simd>>   Matp;
 | 
			
		||||
    Vector<iSinglet<Simd>>   Matm;
 | 
			
		||||
    Vector<iSinglet<Simd>>* _Matp;
 | 
			
		||||
    Vector<iSinglet<Simd>>* _Matm;
 | 
			
		||||
 | 
			
		||||
    //  MooeeInternalCompute(dag,inv,Matp,Matm);
 | 
			
		||||
    if(inv && dag){
 | 
			
		||||
      _Matp = &this->MatpInvDag;
 | 
			
		||||
      _Matm = &this->MatmInvDag;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(inv && (!dag)){
 | 
			
		||||
      _Matp = &this->MatpInv;
 | 
			
		||||
      _Matm = &this->MatmInv;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(!inv){
 | 
			
		||||
      MooeeInternalCompute(dag, inv, Matp, Matm);
 | 
			
		||||
      _Matp = &Matp;
 | 
			
		||||
      _Matm = &Matm;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    assert(_Matp->size() == Ls*LLs);
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvCalls++;
 | 
			
		||||
    this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
    if(switcheroo<Coeff_t>::iscomplex()){
 | 
			
		||||
      parallel_for(auto site=0; site<vol; site++){
 | 
			
		||||
        MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
 | 
			
		||||
      }
 | 
			
		||||
    } else {
 | 
			
		||||
      parallel_for(auto site=0; site<vol; site++){
 | 
			
		||||
        MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->MooeeInvTime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  #ifdef MOBIUS_EOFA_DPERP_VEC
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
 | 
			
		||||
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
 | 
			
		||||
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
 | 
			
		||||
 | 
			
		||||
    template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
    template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
    template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
  #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
@@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 | 
			
		||||
template<class vobj,class cobj>
 | 
			
		||||
class WilsonStencil : public CartesianStencil<vobj,cobj> {
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  double timer0;
 | 
			
		||||
  double timer1;
 | 
			
		||||
  double timer2;
 | 
			
		||||
  double timer3;
 | 
			
		||||
  double timer4;
 | 
			
		||||
  double timer5;
 | 
			
		||||
  double timer6;
 | 
			
		||||
  uint64_t callsi;
 | 
			
		||||
  void ZeroCountersi(void)
 | 
			
		||||
  {
 | 
			
		||||
    timer0=0;
 | 
			
		||||
    timer1=0;
 | 
			
		||||
    timer2=0;
 | 
			
		||||
    timer3=0;
 | 
			
		||||
    timer4=0;
 | 
			
		||||
    timer5=0;
 | 
			
		||||
    timer6=0;
 | 
			
		||||
    callsi=0;
 | 
			
		||||
  }
 | 
			
		||||
  void Reporti(int calls)
 | 
			
		||||
  {
 | 
			
		||||
    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
 | 
			
		||||
    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
 | 
			
		||||
    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
 | 
			
		||||
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
 | 
			
		||||
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> same_node;
 | 
			
		||||
@@ -252,6 +278,7 @@ public:
 | 
			
		||||
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
 | 
			
		||||
    same_node(npoints)
 | 
			
		||||
  { 
 | 
			
		||||
    ZeroCountersi();
 | 
			
		||||
    surface_list.resize(0);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -261,7 +288,6 @@ public:
 | 
			
		||||
    // Here we know the distance is 1 for WilsonStencil
 | 
			
		||||
    for(int point=0;point<this->_npoints;point++){
 | 
			
		||||
      same_node[point] = this->SameNode(point);
 | 
			
		||||
      //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    for(int site = 0 ;site< vol4;site++){
 | 
			
		||||
@@ -282,17 +308,28 @@ public:
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
    this->HaloExchangeOptGather(source,compress);
 | 
			
		||||
    this->CommunicateBegin(reqs);
 | 
			
		||||
    this->CommunicateComplete(reqs);
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    // Asynchronous MPI calls multidirectional, Isend etc...
 | 
			
		||||
    //    this->CommunicateBegin(reqs);
 | 
			
		||||
    //    this->CommunicateComplete(reqs);
 | 
			
		||||
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
 | 
			
		||||
    this->Communicate();
 | 
			
		||||
    double t2=usecond(); timer1 += t2-t1;
 | 
			
		||||
    this->CommsMerge(compress);
 | 
			
		||||
    double t3=usecond(); timer2 += t3-t2;
 | 
			
		||||
    this->CommsMergeSHM(compress);
 | 
			
		||||
    double t4=usecond(); timer3 += t4-t3;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
 | 
			
		||||
  {
 | 
			
		||||
    this->Prepare();
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    this->HaloGatherOpt(source,compress);
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    timer0 += t1-t0;
 | 
			
		||||
    callsi++;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template <class compressor>
 | 
			
		||||
@@ -304,7 +341,9 @@ public:
 | 
			
		||||
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
 | 
			
		||||
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 | 
			
		||||
 | 
			
		||||
    this->mpi3synctime_g-=usecond();
 | 
			
		||||
    this->_grid->StencilBarrier();
 | 
			
		||||
    this->mpi3synctime_g+=usecond();
 | 
			
		||||
 | 
			
		||||
    assert(source._grid==this->_grid);
 | 
			
		||||
    this->halogtime-=usecond();
 | 
			
		||||
@@ -323,7 +362,6 @@ public:
 | 
			
		||||
    int dag = compress.dag;
 | 
			
		||||
    int face_idx=0;
 | 
			
		||||
    if ( dag ) { 
 | 
			
		||||
      //	std::cout << " Optimised Dagger compress " <<std::endl;
 | 
			
		||||
      assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
 | 
			
		||||
      assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
 | 
			
		||||
      assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
 | 
			
		||||
 
 | 
			
		||||
@@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  int vol4;
 | 
			
		||||
  vol4=FourDimGrid.oSites();
 | 
			
		||||
  Stencil.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
 | 
			
		||||
  vol4=FourDimRedBlackGrid.oSites();
 | 
			
		||||
  StencilEven.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
   StencilOdd.BuildSurfaceList(LLs,vol4);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
 | 
			
		||||
                       <<" " << StencilEven.surface_list.size()<<std::endl;
 | 
			
		||||
   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
 | 
			
		||||
   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
     
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::Report(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<int> latt = GridDefaultLatt();          
 | 
			
		||||
    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
    RealD NP = _FourDimGrid->_Nprocessors;
 | 
			
		||||
    RealD NN = _FourDimGrid->NodeCount();
 | 
			
		||||
  RealD NP     = _FourDimGrid->_Nprocessors;
 | 
			
		||||
  RealD NN     = _FourDimGrid->NodeCount();
 | 
			
		||||
  RealD volume = Ls;  
 | 
			
		||||
  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
 | 
			
		||||
  if ( DhopCalls > 0 ) {
 | 
			
		||||
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
 | 
			
		||||
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
 | 
			
		||||
  }
 | 
			
		||||
  if ( DhopCalls > 0){
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
 | 
			
		||||
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
 | 
			
		||||
  Stencil.ZeroCounters();
 | 
			
		||||
  StencilEven.ZeroCounters();
 | 
			
		||||
  StencilOdd.ZeroCounters();
 | 
			
		||||
  Stencil.ZeroCountersi();
 | 
			
		||||
  StencilEven.ZeroCountersi();
 | 
			
		||||
  StencilOdd.ZeroCountersi();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
{
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.HaloExchangeOptGather(in,compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > reqs;
 | 
			
		||||
 | 
			
		||||
  // Rely on async comms; start comms before merge of local data
 | 
			
		||||
  DhopCommTime-=usecond();
 | 
			
		||||
  st.CommunicateBegin(reqs);
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMergeSHM(compressor);
 | 
			
		||||
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  // Perhaps use omp task and region
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  double ctime=0;
 | 
			
		||||
  double ptime=0;
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Ugly explicit thread mapping introduced for OPA reasons.
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
 | 
			
		||||
  { 
 | 
			
		||||
    int tid = omp_get_thread_num();
 | 
			
		||||
    int nthreads = omp_get_num_threads();
 | 
			
		||||
    int me = omp_get_thread_num();
 | 
			
		||||
    int myoff, mywork;
 | 
			
		||||
 | 
			
		||||
    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
 | 
			
		||||
    int sF = LLs * myoff;
 | 
			
		||||
 | 
			
		||||
    if ( me == 0 ) {
 | 
			
		||||
      st.CommunicateComplete(reqs);
 | 
			
		||||
      DhopCommTime+=usecond();
 | 
			
		||||
    } else { 
 | 
			
		||||
      // Interior links in stencil
 | 
			
		||||
      if ( me==1 ) DhopComputeTime-=usecond();
 | 
			
		||||
      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
 | 
			
		||||
      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
 | 
			
		||||
      if ( me==1 ) DhopComputeTime+=usecond();
 | 
			
		||||
    int ncomms = CartesianCommunicator::nCommThreads;
 | 
			
		||||
    if (ncomms == -1) ncomms = 1;
 | 
			
		||||
    assert(nthreads > ncomms);
 | 
			
		||||
    if (tid >= ncomms) {
 | 
			
		||||
      double start = usecond();
 | 
			
		||||
      nthreads -= ncomms;
 | 
			
		||||
      int ttid = tid - ncomms;
 | 
			
		||||
      int n = U._grid->oSites();
 | 
			
		||||
      int chunk = n / nthreads;
 | 
			
		||||
      int rem = n % nthreads;
 | 
			
		||||
      int myblock, myn;
 | 
			
		||||
      if (ttid < rem) {
 | 
			
		||||
	myblock = ttid * chunk + ttid;
 | 
			
		||||
	myn = chunk+1;
 | 
			
		||||
      } else {
 | 
			
		||||
	myblock = ttid*chunk + rem;
 | 
			
		||||
	myn = chunk;
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // do the compute
 | 
			
		||||
      if (dag == DaggerYes) {
 | 
			
		||||
	for (int ss = myblock; ss < myblock+myn; ++ss) {
 | 
			
		||||
	  int sU = ss;
 | 
			
		||||
	  int sF = LLs * sU;
 | 
			
		||||
	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 | 
			
		||||
	}
 | 
			
		||||
      } else {
 | 
			
		||||
	for (int ss = myblock; ss < myblock+myn; ++ss) {
 | 
			
		||||
	  int sU = ss;
 | 
			
		||||
	  int sF = LLs * sU;
 | 
			
		||||
	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
	ptime = usecond() - start;
 | 
			
		||||
    }
 | 
			
		||||
    {
 | 
			
		||||
      double start = usecond();
 | 
			
		||||
      st.CommunicateThreaded();
 | 
			
		||||
      ctime = usecond() - start;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  DhopCommTime += ctime;
 | 
			
		||||
  DhopComputeTime+=ptime;
 | 
			
		||||
 | 
			
		||||
  // First to enter, last to leave timing
 | 
			
		||||
  st.CollateThreads();
 | 
			
		||||
 | 
			
		||||
  DhopFaceTime-=usecond();
 | 
			
		||||
  st.CommsMerge(compressor);
 | 
			
		||||
  DhopFaceTime+=usecond();
 | 
			
		||||
 | 
			
		||||
  // Load imbalance alert. Should use dynamic schedule OMP for loop
 | 
			
		||||
  // Perhaps create a list of only those sites with face work, and 
 | 
			
		||||
  // load balance process the list.
 | 
			
		||||
  DhopComputeTime2-=usecond();
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    int sz=st.surface_list.size();
 | 
			
		||||
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 | 
			
		||||
#else 
 | 
			
		||||
  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
					 DoubledGaugeField & U,
 | 
			
		||||
 
 | 
			
		||||
@@ -30,60 +30,181 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#define REGISTER
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU \
 | 
			
		||||
  {const SiteSpinor & ref (in._odata[offset]);	\
 | 
			
		||||
    Chimu_00=ref()(0)(0);\
 | 
			
		||||
    Chimu_01=ref()(0)(1);\
 | 
			
		||||
    Chimu_02=ref()(0)(2);\
 | 
			
		||||
    Chimu_10=ref()(1)(0);\
 | 
			
		||||
    Chimu_11=ref()(1)(1);\
 | 
			
		||||
    Chimu_12=ref()(1)(2);\
 | 
			
		||||
    Chimu_20=ref()(2)(0);\
 | 
			
		||||
    Chimu_21=ref()(2)(1);\
 | 
			
		||||
    Chimu_22=ref()(2)(2);\
 | 
			
		||||
    Chimu_30=ref()(3)(0);\
 | 
			
		||||
    Chimu_31=ref()(3)(1);\
 | 
			
		||||
    Chimu_32=ref()(3)(2);}
 | 
			
		||||
#define LOAD_CHIMU_BODY(F)			\
 | 
			
		||||
  Chimu_00=ref(F)(0)(0);			\
 | 
			
		||||
  Chimu_01=ref(F)(0)(1);			\
 | 
			
		||||
  Chimu_02=ref(F)(0)(2);			\
 | 
			
		||||
  Chimu_10=ref(F)(1)(0);			\
 | 
			
		||||
  Chimu_11=ref(F)(1)(1);			\
 | 
			
		||||
  Chimu_12=ref(F)(1)(2);			\
 | 
			
		||||
  Chimu_20=ref(F)(2)(0);			\
 | 
			
		||||
  Chimu_21=ref(F)(2)(1);			\
 | 
			
		||||
  Chimu_22=ref(F)(2)(2);			\
 | 
			
		||||
  Chimu_30=ref(F)(3)(0);			\
 | 
			
		||||
  Chimu_31=ref(F)(3)(1);			\
 | 
			
		||||
  Chimu_32=ref(F)(3)(2)
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI\
 | 
			
		||||
  {const SiteHalfSpinor &ref(buf[offset]);	\
 | 
			
		||||
    Chi_00 = ref()(0)(0);\
 | 
			
		||||
    Chi_01 = ref()(0)(1);\
 | 
			
		||||
    Chi_02 = ref()(0)(2);\
 | 
			
		||||
    Chi_10 = ref()(1)(0);\
 | 
			
		||||
    Chi_11 = ref()(1)(1);\
 | 
			
		||||
    Chi_12 = ref()(1)(2);}
 | 
			
		||||
#define LOAD_CHIMU(DIR,F,PERM)						\
 | 
			
		||||
  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_BODY(F)				\
 | 
			
		||||
    Chi_00 = ref(F)(0)(0);\
 | 
			
		||||
    Chi_01 = ref(F)(0)(1);\
 | 
			
		||||
    Chi_02 = ref(F)(0)(2);\
 | 
			
		||||
    Chi_10 = ref(F)(1)(0);\
 | 
			
		||||
    Chi_11 = ref(F)(1)(1);\
 | 
			
		||||
    Chi_12 = ref(F)(1)(2)
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI(DIR,F,PERM)					\
 | 
			
		||||
  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//G-parity implementations using in-place intrinsic ops
 | 
			
		||||
 | 
			
		||||
//1l 1h -> 1h 1l
 | 
			
		||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
 | 
			
		||||
//0h,1l -> 1l,0h
 | 
			
		||||
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
 | 
			
		||||
//Pulled fermion through forwards face, GPBC on upper component
 | 
			
		||||
//Need 0= 0l 1h   1= 1l 0h
 | 
			
		||||
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
 | 
			
		||||
//Pulled fermion through backwards face, GPBC on lower component
 | 
			
		||||
//Need 0= 1l 0h   1= 0l 1h
 | 
			
		||||
 | 
			
		||||
//1l 1h -> 1h 1l
 | 
			
		||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
 | 
			
		||||
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
 | 
			
		||||
  permute##PERM(tmp1, ref(1)(S)(C));				\
 | 
			
		||||
  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
 | 
			
		||||
  INTO = tmp2;
 | 
			
		||||
 | 
			
		||||
//0l 0h -> 0h 0l
 | 
			
		||||
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
 | 
			
		||||
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
 | 
			
		||||
  permute##PERM(tmp1, ref(0)(S)(C));				\
 | 
			
		||||
  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
 | 
			
		||||
  INTO = tmp2;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_SETUP(DIR,F)						\
 | 
			
		||||
  g = F;								\
 | 
			
		||||
  direction = st._directions[DIR];				\
 | 
			
		||||
  distance = st._distances[DIR];				\
 | 
			
		||||
  sl = st._grid->_simd_layout[direction];			\
 | 
			
		||||
  inplace_twist = 0;						\
 | 
			
		||||
  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
 | 
			
		||||
    if(sl == 1){							\
 | 
			
		||||
      g = (F+1) % 2;							\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      inplace_twist = 1;						\
 | 
			
		||||
    }									\
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
 | 
			
		||||
  { const SiteSpinor &ref(in._odata[offset]);				\
 | 
			
		||||
    LOAD_CHI_SETUP(DIR,F);						\
 | 
			
		||||
    if(!inplace_twist){							\
 | 
			
		||||
      LOAD_CHIMU_BODY(g);						\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 | 
			
		||||
	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
      }else{								\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
 | 
			
		||||
      } \
 | 
			
		||||
    } \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
 | 
			
		||||
  { const SiteHalfSpinor &ref(buf[offset]);				\
 | 
			
		||||
    LOAD_CHI_SETUP(DIR,F);						\
 | 
			
		||||
    if(!inplace_twist){							\
 | 
			
		||||
      LOAD_CHI_BODY(g);							\
 | 
			
		||||
    }else{								\
 | 
			
		||||
      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 | 
			
		||||
	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
      }else{								\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
 | 
			
		||||
	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
 | 
			
		||||
      }									\
 | 
			
		||||
    }									\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 | 
			
		||||
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 | 
			
		||||
 | 
			
		||||
// To splat or not to splat depends on the implementation
 | 
			
		||||
#define MULT_2SPIN(A)\
 | 
			
		||||
  {auto & ref(U._odata[sU](A));			\
 | 
			
		||||
   Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
    UChi_00 = U_00*Chi_00;\
 | 
			
		||||
    UChi_10 = U_00*Chi_10;\
 | 
			
		||||
    UChi_01 = U_10*Chi_00;\
 | 
			
		||||
    UChi_11 = U_10*Chi_10;\
 | 
			
		||||
    UChi_02 = U_20*Chi_00;\
 | 
			
		||||
    UChi_12 = U_20*Chi_10;\
 | 
			
		||||
    UChi_00+= U_01*Chi_01;\
 | 
			
		||||
    UChi_10+= U_01*Chi_11;\
 | 
			
		||||
    UChi_01+= U_11*Chi_01;\
 | 
			
		||||
    UChi_11+= U_11*Chi_11;\
 | 
			
		||||
    UChi_02+= U_21*Chi_01;\
 | 
			
		||||
    UChi_12+= U_21*Chi_11;\
 | 
			
		||||
    Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
    UChi_00+= U_00*Chi_02;\
 | 
			
		||||
    UChi_10+= U_00*Chi_12;\
 | 
			
		||||
    UChi_01+= U_10*Chi_02;\
 | 
			
		||||
    UChi_11+= U_10*Chi_12;\
 | 
			
		||||
    UChi_02+= U_20*Chi_02;\
 | 
			
		||||
    UChi_12+= U_20*Chi_12;}
 | 
			
		||||
#define MULT_2SPIN_BODY \
 | 
			
		||||
  Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
  Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
  Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
  Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
  UChi_00 = U_00*Chi_00;			\
 | 
			
		||||
  UChi_10 = U_00*Chi_10;			\
 | 
			
		||||
  UChi_01 = U_10*Chi_00;			\
 | 
			
		||||
  UChi_11 = U_10*Chi_10;			\
 | 
			
		||||
  UChi_02 = U_20*Chi_00;			\
 | 
			
		||||
  UChi_12 = U_20*Chi_10;			\
 | 
			
		||||
  UChi_00+= U_01*Chi_01;			\
 | 
			
		||||
  UChi_10+= U_01*Chi_11;			\
 | 
			
		||||
  UChi_01+= U_11*Chi_01;			\
 | 
			
		||||
  UChi_11+= U_11*Chi_11;			\
 | 
			
		||||
  UChi_02+= U_21*Chi_01;			\
 | 
			
		||||
  UChi_12+= U_21*Chi_11;			\
 | 
			
		||||
  Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
  Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
  Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
  UChi_00+= U_00*Chi_02;			\
 | 
			
		||||
  UChi_10+= U_00*Chi_12;			\
 | 
			
		||||
  UChi_01+= U_10*Chi_02;			\
 | 
			
		||||
  UChi_11+= U_10*Chi_12;			\
 | 
			
		||||
  UChi_02+= U_20*Chi_02;			\
 | 
			
		||||
  UChi_12+= U_20*Chi_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN(A,F)					\
 | 
			
		||||
  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_GPARITY(A,F)				\
 | 
			
		||||
  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR(dir)			\
 | 
			
		||||
@@ -307,84 +428,87 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
  result_31-= UChi_11;	\
 | 
			
		||||
  result_32-= UChi_12;
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
 | 
			
		||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if ( local ) {				\
 | 
			
		||||
    LOAD_CHIMU;					\
 | 
			
		||||
    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    PROJ;					\
 | 
			
		||||
    if ( perm) {				\
 | 
			
		||||
      PERMUTE_DIR(PERM);			\
 | 
			
		||||
    }						\
 | 
			
		||||
  } else {					\
 | 
			
		||||
    LOAD_CHI;					\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
  }						\
 | 
			
		||||
  MULT_2SPIN(DIR);				\
 | 
			
		||||
  MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
  RECON;					
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if ( local ) {				\
 | 
			
		||||
    LOAD_CHIMU;					\
 | 
			
		||||
    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    PROJ;					\
 | 
			
		||||
    if ( perm) {				\
 | 
			
		||||
      PERMUTE_DIR(PERM);			\
 | 
			
		||||
    }						\
 | 
			
		||||
  } else if ( st.same_node[DIR] ) {		\
 | 
			
		||||
    LOAD_CHI;					\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
  }						\
 | 
			
		||||
  if (local || st.same_node[DIR] ) {		\
 | 
			
		||||
    MULT_2SPIN(DIR);				\
 | 
			
		||||
    MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
    RECON;					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
 | 
			
		||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
 | 
			
		||||
  SE=st.GetEntry(ptype,DIR,ss);			\
 | 
			
		||||
  offset = SE->_offset;				\
 | 
			
		||||
  local  = SE->_is_local;			\
 | 
			
		||||
  perm   = SE->_permute;			\
 | 
			
		||||
  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
 | 
			
		||||
    LOAD_CHI;					\
 | 
			
		||||
    MULT_2SPIN(DIR);				\
 | 
			
		||||
    LOAD_CHI_IMPL(DIR,F,PERM);			\
 | 
			
		||||
    MULT_2SPIN_IMPL(DIR,F);			\
 | 
			
		||||
    RECON;					\
 | 
			
		||||
    nmu++;					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_RESULT(ss)				\
 | 
			
		||||
#define HAND_RESULT(ss,F)			\
 | 
			
		||||
  {						\
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);		\
 | 
			
		||||
    vstream(ref()(0)(0),result_00);		\
 | 
			
		||||
    vstream(ref()(0)(1),result_01);		\
 | 
			
		||||
    vstream(ref()(0)(2),result_02);		\
 | 
			
		||||
    vstream(ref()(1)(0),result_10);		\
 | 
			
		||||
    vstream(ref()(1)(1),result_11);		\
 | 
			
		||||
    vstream(ref()(1)(2),result_12);		\
 | 
			
		||||
    vstream(ref()(2)(0),result_20);		\
 | 
			
		||||
    vstream(ref()(2)(1),result_21);		\
 | 
			
		||||
    vstream(ref()(2)(2),result_22);		\
 | 
			
		||||
    vstream(ref()(3)(0),result_30);		\
 | 
			
		||||
    vstream(ref()(3)(1),result_31);		\
 | 
			
		||||
    vstream(ref()(3)(2),result_32);		\
 | 
			
		||||
    vstream(ref(F)(0)(0),result_00);		\
 | 
			
		||||
    vstream(ref(F)(0)(1),result_01);		\
 | 
			
		||||
    vstream(ref(F)(0)(2),result_02);		\
 | 
			
		||||
    vstream(ref(F)(1)(0),result_10);		\
 | 
			
		||||
    vstream(ref(F)(1)(1),result_11);		\
 | 
			
		||||
    vstream(ref(F)(1)(2),result_12);		\
 | 
			
		||||
    vstream(ref(F)(2)(0),result_20);		\
 | 
			
		||||
    vstream(ref(F)(2)(1),result_21);		\
 | 
			
		||||
    vstream(ref(F)(2)(2),result_22);		\
 | 
			
		||||
    vstream(ref(F)(3)(0),result_30);		\
 | 
			
		||||
    vstream(ref(F)(3)(1),result_31);		\
 | 
			
		||||
    vstream(ref(F)(3)(2),result_32);		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define HAND_RESULT_EXT(ss)			\
 | 
			
		||||
#define HAND_RESULT_EXT(ss,F)			\
 | 
			
		||||
  if (nmu){					\
 | 
			
		||||
    SiteSpinor & ref (out._odata[ss]);		\
 | 
			
		||||
    ref()(0)(0)+=result_00;		\
 | 
			
		||||
    ref()(0)(1)+=result_01;		\
 | 
			
		||||
    ref()(0)(2)+=result_02;		\
 | 
			
		||||
    ref()(1)(0)+=result_10;		\
 | 
			
		||||
    ref()(1)(1)+=result_11;		\
 | 
			
		||||
    ref()(1)(2)+=result_12;		\
 | 
			
		||||
    ref()(2)(0)+=result_20;		\
 | 
			
		||||
    ref()(2)(1)+=result_21;		\
 | 
			
		||||
    ref()(2)(2)+=result_22;		\
 | 
			
		||||
    ref()(3)(0)+=result_30;		\
 | 
			
		||||
    ref()(3)(1)+=result_31;		\
 | 
			
		||||
    ref()(3)(2)+=result_32;		\
 | 
			
		||||
    ref(F)(0)(0)+=result_00;		\
 | 
			
		||||
    ref(F)(0)(1)+=result_01;		\
 | 
			
		||||
    ref(F)(0)(2)+=result_02;		\
 | 
			
		||||
    ref(F)(1)(0)+=result_10;		\
 | 
			
		||||
    ref(F)(1)(1)+=result_11;		\
 | 
			
		||||
    ref(F)(1)(2)+=result_12;		\
 | 
			
		||||
    ref(F)(2)(0)+=result_20;		\
 | 
			
		||||
    ref(F)(2)(1)+=result_21;		\
 | 
			
		||||
    ref(F)(2)(2)+=result_22;		\
 | 
			
		||||
    ref(F)(3)(0)+=result_30;		\
 | 
			
		||||
    ref(F)(3)(1)+=result_31;		\
 | 
			
		||||
    ref(F)(3)(2)+=result_32;		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -463,15 +587,18 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -485,16 +612,19 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
@@ -509,16 +639,20 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 | 
			
		||||
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -532,16 +666,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
 | 
			
		||||
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT(ss);
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
 | 
			
		||||
  ZERO_RESULT;							\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
 | 
			
		||||
  HAND_RESULT(ss,F)
 | 
			
		||||
  
 | 
			
		||||
  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl> void 
 | 
			
		||||
@@ -557,16 +695,20 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT_EXT(ss);
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT_EXT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -581,16 +723,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 | 
			
		||||
  StencilEntry *SE;
 | 
			
		||||
  int offset,local,perm, ptype;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  ZERO_RESULT;
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
 | 
			
		||||
  HAND_RESULT_EXT(ss);
 | 
			
		||||
 | 
			
		||||
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
 | 
			
		||||
  ZERO_RESULT; \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
 | 
			
		||||
  HAND_RESULT_EXT(ss,F)
 | 
			
		||||
 | 
			
		||||
  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
@@ -646,11 +792,124 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 | 
			
		||||
				    const FermionField &in,		\
 | 
			
		||||
				    FermionField &out){ assert(0); }	\
 | 
			
		||||
 | 
			
		||||
  HAND_SPECIALISE_EMPTY(GparityWilsonImplF);
 | 
			
		||||
  HAND_SPECIALISE_EMPTY(GparityWilsonImplD);
 | 
			
		||||
  HAND_SPECIALISE_EMPTY(GparityWilsonImplFH);
 | 
			
		||||
  HAND_SPECIALISE_EMPTY(GparityWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define HAND_SPECIALISE_GPARITY(IMPL)					\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
				    int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
					    int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
 | 
			
		||||
    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
							     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
									\
 | 
			
		||||
  template<> void							\
 | 
			
		||||
  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
 | 
			
		||||
						     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int nmu=0;								\
 | 
			
		||||
    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    nmu = 0;								\
 | 
			
		||||
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }									\
 | 
			
		||||
  template<>								\
 | 
			
		||||
  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
 | 
			
		||||
							     int ss,int sU,const FermionField &in, FermionField &out) \
 | 
			
		||||
  {									\
 | 
			
		||||
    typedef IMPL Impl;							\
 | 
			
		||||
    typedef typename Simd::scalar_type S;				\
 | 
			
		||||
    typedef typename Simd::vector_type V;				\
 | 
			
		||||
									\
 | 
			
		||||
    HAND_DECLARATIONS(ignore);						\
 | 
			
		||||
									\
 | 
			
		||||
    StencilEntry *SE;							\
 | 
			
		||||
    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
 | 
			
		||||
    int nmu=0;								\
 | 
			
		||||
    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
    nmu = 0;								\
 | 
			
		||||
    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
 | 
			
		||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
////////////// Wilson ; uses this implementation /////////////////////
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_THEM(A) \
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user