More re-import of Mobius EOFA

2025-08-23 06:27:09 +01:00 · 2017-08-17 19:28:53 -04:00
parent e140b3f802
commit ac9e6b63c0
2 changed files with 261 additions and 229 deletions
--- a/lib/qcd/action/fermion/DomainWallEOFAFermion.cc
+++ b/lib/qcd/action/fermion/DomainWallEOFAFermion.cc
@@ -347,9 +347,7 @@ namespace QCD {
        GridBase* grid = this->FermionRedBlackGrid();
        int LLs = grid->_rdimensions[0];

-        if(LLs == Ls){
-            return; // Not vectorised in 5th direction
-        }
+        if(LLs == Ls){ return; } // Not vectorised in 5th direction

        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
--- a/lib/qcd/action/fermion/MobiusEOFAFermion.cc
+++ b/lib/qcd/action/fermion/MobiusEOFAFermion.cc
@@ -133,7 +133,36 @@ namespace QCD {
    }

    template<class Impl>
-    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ }
+    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
+    {
+      int Ls = this->Ls;
+      RealD m = this->mq1;
+      RealD c = 0.5 * this->alpha;
+      RealD d = 0.5;
+
+      RealD DtInv_p(0.0), DtInv_m(0.0);
+      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
+      FermionField tmp = zero;
+
+      for(int s=0; s<Ls; ++s){
+      for(int sp=0; sp<Ls; ++sp){
+
+        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
+        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
+        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
+        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
+
+        if(dag){
+          RealD tmp(DtInv_p);
+          DtInv_p = DtInv_m;
+          DtInv_m = tmp;
+        }
+
+        axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
+        axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
+
+      }}
+    }

    /*****************************************************************************************************/

@@ -172,298 +201,303 @@ namespace QCD {
    template<class Impl>
    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
    {
-        int Ls = this->Ls;
+      int Ls = this->Ls;

-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
-        std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;
+      std::vector<Coeff_t> diag(Ls,1.0);
+      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
+      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

-        // no shift term
-        if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
+      // no shift term
+      if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }

-        // fused M + shift operation
-        else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
+      // fused M + shift operation
+      else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD shift = this->shift;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
+      int   Ls    = this->Ls;

-        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
-        Coeff_t shiftp(0.0), shiftm(0.0);
-        if(shift != 0.0){
-          if(pm == 1){ shiftp = shift*(mq3-mq2); }
-          else{ shiftm = -shift*(mq3-mq2); }
-        }
+      std::vector<Coeff_t> diag(Ls,1.0);
+      std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1 + shiftp;
+      std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = this->mq1 + shiftm;

-        std::vector<Coeff_t> diag(Ls,1.0);
-        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
+      // no shift term
+      if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }

-        #if(0)
-            std::cout << GridLogMessage << "MobiusEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
-            for(int i=0; i<diag.size(); ++i){
-                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
-            }
-            for(int i=0; i<upper.size(); ++i){
-                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
-            }
-            for(int i=0; i<lower.size(); ++i){
-                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
-            }
-        #endif
-
-        this->M5Ddag(psi, chi, chi, lower, diag, upper);
+      // fused M + shift operation
+      else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
    }

    // half checkerboard operations
    template<class Impl>
    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
    {
-        int Ls = this->Ls;
+      int Ls = this->Ls;

-        std::vector<Coeff_t> diag = this->bee;
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
+      // coefficients of Mooee
+      std::vector<Coeff_t> diag = this->bee;
+      std::vector<Coeff_t> upper(Ls);
+      std::vector<Coeff_t> lower(Ls);
+      for(int s=0; s<Ls; s++){
+        upper[s] = -this->cee[s];
+        lower[s] = -this->cee[s];
+      }
+      upper[Ls-1] *= -this->mq1;
+      lower[0]    *= -this->mq1;

-        for(int s=0; s<Ls; s++){
-          upper[s] = -this->cee[s];
-          lower[s] = -this->cee[s];
-        }
-        upper[Ls-1] = this->dm;
-        lower[0]    = this->dp;
+      // no shift term
+      if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }

-        this->M5D(psi, psi, chi, lower, diag, upper);
+      // fused M + shift operation
+      else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
    {
-        int Ls = this->Ls;
+      int Ls = this->Ls;

-        std::vector<Coeff_t> diag = this->bee;
-        std::vector<Coeff_t> upper(Ls);
-        std::vector<Coeff_t> lower(Ls);
-
-        for(int s=0; s<Ls; s++){
-          upper[s] = -this->cee[s];
-          lower[s] = -this->cee[s];
+      // coefficients of MooeeDag
+      std::vector<Coeff_t> diag = this->bee;
+      std::vector<Coeff_t> upper(Ls);
+      std::vector<Coeff_t> lower(Ls);
+      for(int s=0; s<Ls; s++){
+        if(s==0) {
+          upper[s] = -this->cee[s+1];
+          lower[s] = this->mq1*this->cee[Ls-1];
+        } else if(s==(Ls-1)) {
+          upper[s] = this->mq1*this->cee[0];
+          lower[s] = -this->cee[s-1];
+        } else {
+          upper[s] = -this->cee[s+1];
+          lower[s] = -this->cee[s-1];
        }
-        upper[Ls-1] = this->dp;
-        lower[0]    = this->dm;
+      }

-        this->M5Ddag(psi, psi, chi, lower, diag, upper);
+      // no shift term
+      if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
+
+      // fused M + shift operation
+      else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
    }

    /****************************************************************************************/

-    //Zolo
+    // Computes coefficients for applying Cayley preconditioned shift operators
+    //  (Mooee + \Delta) --> Mooee_shift
+    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
+    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
+    // For the latter two cases, the operation takes the form
+    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = [ Mooee^{-1} ]_{ij} \psi_{j} +
+    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{\pm} \psi_{j} )
    template<class Impl>
-    void MobiusEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
+    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
    {
-        int   Ls    = this->Ls;
-        int   pm    = this->pm;
-        RealD mq1   = this->mq1;
-        RealD mq2   = this->mq2;
-        RealD mq3   = this->mq3;
-        RealD shift = this->shift;
+      int   Ls    = this->Ls;
+      int   pm    = this->pm;
+      RealD alpha = this->alpha;
+      RealD k     = this->k;
+      RealD mq1   = this->mq1;
+      RealD shift = this->shift;

-        ////////////////////////////////////////////////////////
-        // Constants for the preconditioned matrix Cayley form
-        ////////////////////////////////////////////////////////
-        this->bs.resize(Ls);
-        this->cs.resize(Ls);
-        this->aee.resize(Ls);
-        this->aeo.resize(Ls);
-        this->bee.resize(Ls);
-        this->beo.resize(Ls);
-        this->cee.resize(Ls);
-        this->ceo.resize(Ls);
+      // Initialize
+      Mooee_shift.resize(Ls);
+      MooeeInv_shift_lc.resize(Ls);
+      MooeeInv_shift_norm.resize(Ls);
+      MooeeInvDag_shift_lc.resize(Ls);
+      MooeeInvDag_shift_norm.resize(Ls);

-        for(int i=0; i<Ls; ++i){
-          this->bee[i] = 4.0 - this->M5 + 1.0;
-          this->cee[i] = 1.0;
-        }
+      // Construct Mooee_shift
+      int idx(0);
+      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
+                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
+      for(int s=0; s<Ls; ++s){
+        idx = (pm == 1) ? (s) : (Ls-1-s);
+        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
+      }

-        for(int i=0; i<Ls; ++i){
-          this->aee[i] = this->cee[i];
-          this->bs[i] = this->beo[i] = 1.0;
-          this->cs[i] = this->ceo[i] = 0.0;
-        }
+      // Tridiagonal solve for MooeeInvDag_shift_lc
+      {
+        Coeff_t m(0.0);
+        std::vector<Coeff_t> d = Mooee_shift;
+        std::vector<Coeff_t> u(Ls,0.0);
+        std::vector<Coeff_t> y(Ls,0.0);
+        std::vector<Coeff_t> q(Ls,0.0);
+        if(pm == 1){ u[0] = 1.0; }
+        else{ u[Ls-1] = 1.0; }

-        //////////////////////////////////////////
-        // EOFA shift terms
-        //////////////////////////////////////////
+        // Tridiagonal matrix algorithm + Sherman-Morrison formula
+        //
+        // We solve
+        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
+        // where Mooee' is the tridiagonal part of Mooee_{+}, and
+        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
+        // so that the outer-product u \otimes v gives the (0,Ls-1)
+        // entry of Mooee_{+}.
+        //
+        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
+        // and then construct the solution to the original system
+        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
        if(pm == 1){
-          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
-          this->dm = mq1*this->cee[Ls-1];
-        } else if(this->pm == -1) {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
-        } else {
-          this->dp = mq1*this->cee[0];
-          this->dm = mq1*this->cee[Ls-1];
+          for(int s=1; s<Ls; ++s){
+            m = -this->cee[s] / this->bee[s-1];
+            d[s] -= m*d[s-1];
+            u[s] -= m*u[s-1];
+          }
        }
-
-        //////////////////////////////////////////
-        // LDU decomposition of eeoo
-        //////////////////////////////////////////
-        this->dee.resize(Ls+1);
-        this->lee.resize(Ls);
-        this->leem.resize(Ls);
-        this->uee.resize(Ls);
-        this->ueem.resize(Ls);
-
-        for(int i=0; i<Ls; ++i){
-
-          if(i < Ls-1){
-
-            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
-
-            this->leem[i] = this->dm/this->bee[i];
-            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
-
-            this->dee[i] = this->bee[i];
-
-            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
-
-            this->ueem[i] = this->dp / this->bee[0];
-            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
-
+        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
+        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
+        for(int s=Ls-2; s>=0; --s){
+          if(pm == 1){
+            y[s] = d[s] / this->bee[s];
+            q[s] = u[s] / this->bee[s];
          } else {
-
-            this->lee[i]  = 0.0;
-            this->leem[i] = 0.0;
-            this->uee[i]  = 0.0;
-            this->ueem[i] = 0.0;
-
+            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
+            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
          }
        }

-        {
-          Coeff_t delta_d = 1.0 / this->bee[0];
-          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
-          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
-          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
+        // Construct MooeeInvDag_shift_lc
+        for(int s=0; s<Ls; ++s){
+          if(pm == 1){
+            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
+              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
+          } else {
+            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
+              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
+          }
        }

-        int inv = 1;
-        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
-        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
+        // Compute remaining coefficients
+        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
+        for(int s=0; s<Ls; ++s){
+
+          // MooeeInv_shift_lc
+          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
+          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
+
+          // MooeeInv_shift_norm
+          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
+            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
+
+          // MooeeInvDag_shift_norm
+          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
+            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
+          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
+            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
+        }
+      }
    }

-    // Recompute Cayley-form coefficients for different shift
+    // Recompute coefficients for a different value of shift constant
    template<class Impl>
    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
    {
-        this->shift = new_shift;
-        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
-        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
+      this->shift = new_shift;
+      if(new_shift != 0.0){
+        SetCoefficientsPrecondShiftOps();
+      } else {
+        int Ls = this->Ls;
+        Mooee_shift.resize(Ls,0.0);
+        MooeeInv_shift_lc.resize(Ls,0.0);
+        MooeeInv_shift_norm.resize(Ls,0.0);
+        MooeeInvDag_shift_lc.resize(Ls,0.0);
+        MooeeInvDag_shift_norm.resize(Ls,0.0);
+      }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
-        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
+      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
    {
-        int Ls = this->Ls;
+      int Ls = this->Ls;

-        GridBase* grid = this->FermionRedBlackGrid();
-        int LLs = grid->_rdimensions[0];
+      GridBase* grid = this->FermionRedBlackGrid();
+      int LLs = grid->_rdimensions[0];

-        if(LLs == Ls){
-            return; // Not vectorised in 5th direction
-        }
+      if(LLs == Ls){ return; } // Not vectorised in 5th direction

-        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);

-        for(int s=0; s<Ls; s++){
-            Pplus(s,s)  = this->bee[s];
-            Pminus(s,s) = this->bee[s];
-        }
+      for(int s=0; s<Ls; s++){
+        Pplus(s,s)  = this->bee[s];
+        Pminus(s,s) = this->bee[s];
+      }

-        for(int s=0; s<Ls-1; s++){
-            Pminus(s,s+1) = -this->cee[s];
-        }
+      for(int s=0; s<Ls-1; s++){
+        Pminus(s,s+1) = -this->cee[s];
+        Pplus(s+1,s) = -this->cee[s+1];
+      }

-        for(int s=0; s<Ls-1; s++){
-            Pplus(s+1,s) = -this->cee[s+1];
-        }
+      Pplus (0,Ls-1) = this->mq1*this->cee[0];
+      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];

-        Pplus (0,Ls-1) = this->dp;
-        Pminus(Ls-1,0) = this->dm;
-
-        Eigen::MatrixXcd PplusMat ;
-        Eigen::MatrixXcd PminusMat;
-
-        #if(0)
-            std::cout << GridLogMessage << "Pplus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pplus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-            std::cout << GridLogMessage << "Pminus:" << std::endl;
-            for(int s=0; s<Ls; ++s){
-                for(int ss=0; ss<Ls; ++ss){
-                    std::cout << Pminus(s,ss) << "\t";
-                }
-                std::cout << std::endl;
-            }
-        #endif
-
-        if(inv) {
-            PplusMat  = Pplus.inverse();
-            PminusMat = Pminus.inverse();
+      if(this->shift != 0.0){
+        RealD c = 0.5 * this->alpha;
+        RealD d = 0.5;
+        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
+        if(this->pm == 1) {
+          for(int s=0; s<Ls; ++s){
+            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
+          }
        } else {
-            PplusMat  = Pplus;
-            PminusMat = Pminus;
+          for(int s=0; s<Ls; ++s){
+            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
+          }
        }
+      }

-        if(dag){
-            PplusMat.adjointInPlace();
-            PminusMat.adjointInPlace();
+      Eigen::MatrixXcd PplusMat ;
+      Eigen::MatrixXcd PminusMat;
+
+      if(inv) {
+        PplusMat  = Pplus.inverse();
+        PminusMat = Pminus.inverse();
+      } else {
+        PplusMat  = Pplus;
+        PminusMat = Pminus;
+      }
+
+      if(dag){
+        PplusMat.adjointInPlace();
+        PminusMat.adjointInPlace();
+      }
+
+      typedef typename SiteHalfSpinor::scalar_type scalar_type;
+      const int Nsimd = Simd::Nsimd();
+      Matp.resize(Ls*LLs);
+      Matm.resize(Ls*LLs);
+
+      for(int s2=0; s2<Ls; s2++){
+      for(int s1=0; s1<LLs; s1++){
+        int istride = LLs;
+        int ostride = 1;
+        Simd Vp;
+        Simd Vm;
+        scalar_type *sp = (scalar_type*) &Vp;
+        scalar_type *sm = (scalar_type*) &Vm;
+        for(int l=0; l<Nsimd; l++){
+          if(switcheroo<Coeff_t>::iscomplex()) {
+            sp[l] = PplusMat (l*istride+s1*ostride,s2);
+            sm[l] = PminusMat(l*istride+s1*ostride,s2);
+          } else {
+            // if real
+            scalar_type tmp;
+            tmp = PplusMat (l*istride+s1*ostride,s2);
+            sp[l] = scalar_type(tmp.real(),tmp.real());
+            tmp = PminusMat(l*istride+s1*ostride,s2);
+            sm[l] = scalar_type(tmp.real(),tmp.real());
+          }
        }
+        Matp[LLs*s2+s1] = Vp;
+        Matm[LLs*s2+s1] = Vm;
+      }}
+  }

-        typedef typename SiteHalfSpinor::scalar_type scalar_type;
-        const int Nsimd = Simd::Nsimd();
-        Matp.resize(Ls*LLs);
-        Matm.resize(Ls*LLs);
-
-        for(int s2=0; s2<Ls; s2++){
-        for(int s1=0; s1<LLs; s1++){
-            int istride = LLs;
-            int ostride = 1;
-            Simd Vp;
-            Simd Vm;
-            scalar_type *sp = (scalar_type*) &Vp;
-            scalar_type *sm = (scalar_type*) &Vm;
-            for(int l=0; l<Nsimd; l++){
-                if(switcheroo<Coeff_t>::iscomplex()) {
-                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
-                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
-                } else {
-                    // if real
-                    scalar_type tmp;
-                    tmp = PplusMat (l*istride+s1*ostride,s2);
-                    sp[l] = scalar_type(tmp.real(),tmp.real());
-                    tmp = PminusMat(l*istride+s1*ostride,s2);
-                    sm[l] = scalar_type(tmp.real(),tmp.real());
-                }
-            }
-            Matp[LLs*s2+s1] = Vp;
-            Matm[LLs*s2+s1] = Vm;
-        }}
-    }
-
-    FermOpTemplateInstantiate(MobiusEOFAFermion);
-    GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
+  FermOpTemplateInstantiate(MobiusEOFAFermion);
+  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);

 }}