Merge commit '1e554350acae0e67fa7177ed0db9d4f684a54af2'

2025-11-03 05:24:32 +00:00 · 2016-04-30 00:17:52 -07:00
parent ba09cbae3e 1e554350ac
commit f6c53e5039
33 changed files with 1549 additions and 520 deletions
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -142,6 +142,10 @@ namespace Grid {
        mult(&phi(),&U(mu),&chi());
      }

+      template<class ref>
+      inline void loadLinkElement(Simd & reg,ref &memory){
+	reg = memory;
+      }
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
      {
        conformable(Uds._grid,GaugeGrid);
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP

    };

+
+
+    ///////
+    // Single flavour four spinors with colour index, 5d redblack
+    ///////
+    template<class S,int Nrepresentation=Nc>
+    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    public:
+
+      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+      
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
+      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef Lattice<SiteSpinor>             FermionField;
+
+      // Make the doubled gauge field a *scalar*
+      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
+      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
+      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
+
+      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
+
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef WilsonImplParams ImplParams;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
+
+      ImplParams Params;
+
+      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+
+      bool overlapCommsCompute(void) { return false; };
+    
+      template<class ref>
+      inline void loadLinkElement(Simd & reg,ref &memory){
+	vsplat(reg,memory);
+      }
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+      {
+	SiteGaugeLink UU;
+	for(int i=0;i<Nrepresentation;i++){
+	  for(int j=0;j<Nrepresentation;j++){
+	    vsplat(UU()()(i,j),U(mu)()(i,j));
+	  }
+	}
+        mult(&phi(),&UU(),&chi());
+      }
+
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+	SiteScalarGaugeField  ScalarUmu;
+	SiteDoubledGaugeField ScalarUds;
+
+        GaugeLinkField U   (Umu._grid);
+	GaugeField     Uadj(Umu._grid);
+        for(int mu=0;mu<Nd;mu++){
+  	  U = PeekIndex<LorentzIndex>(Umu,mu);
+	  U = adj(Cshift(U,mu,-1));
+	  PokeIndex<LorentzIndex>(Uadj,U,mu);
+	}
+
+	for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
+	  std::vector<int> lcoor;
+	  GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
+
+	  peekLocalSite(ScalarUmu,Umu,lcoor);
+	  for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
+
+	  peekLocalSite(ScalarUmu,Uadj,lcoor);
+	  for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
+
+	  pokeLocalSite(ScalarUds,Uds,lcoor);
+	}
+
+      }
+	
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	assert(0);
+      }   
+
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+	assert(0);
+      }
+
+    };
+
+
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
 	conformable(Uds._grid,GaugeGrid);
 	conformable(Umu._grid,GaugeGrid);
 	
-	GaugeLinkField Utmp(GaugeGrid);
-	GaugeLinkField U(GaugeGrid);
+	GaugeLinkField Utmp (GaugeGrid);
+	GaugeLinkField U    (GaugeGrid);
 	GaugeLinkField Uconj(GaugeGrid);
 	
 	Lattice<iScalar<vInteger> > coor(GaugeGrid);
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

+    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
+    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
+    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -288,11 +288,7 @@ PARALLEL_FOR_LOOP
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
  {
-    if ( Impl::overlapCommsCompute () ) { 
-      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
-    } else { 
-      DhopInternalCommsThenCompute(st,U,in,out,dag);
-    }
+    DhopInternalCommsThenCompute(st,U,in,out,dag);
  }
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
@@ -330,15 +326,6 @@ PARALLEL_FOR_LOOP
    }
  };

-
-  template<class Impl>
-  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag) {
-
-    assert(0);
-
-  };
-
 
  FermOpTemplateInstantiate(WilsonFermion);
  GparityFermOpTemplateInstantiate(WilsonFermion);
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -116,9 +116,6 @@ namespace Grid {

      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 				    const FermionField &in, FermionField &out,int dag) ;
-      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
-				    const FermionField &in, FermionField &out,int dag) ;
-

      // Constructor
      WilsonFermion(GaugeField &_Umu,
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
-  
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
-
  assert(FiveDimRedBlackGrid._checker_dim==1);

  // Dimension zero of the five-d is the Ls direction
@@ -106,11 +104,75 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  dslashtime=0;
  dslash1time=0;
 }  
+
+template<class Impl>
+WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
+				       GridCartesian         &FiveDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
+				       RealD _M5,const ImplParams &p) :
+  Kernels(p),
+  _FiveDimGrid        (&FiveDimGrid),
+  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
+  _FourDimGrid        (&FourDimGrid),
+  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  M5(_M5),
+  Umu(_FourDimGrid),
+  UmuEven(_FourDimRedBlackGrid),
+  UmuOdd (_FourDimRedBlackGrid),
+  Lebesgue(_FourDimGrid),
+  LebesgueEvenOdd(_FourDimRedBlackGrid)
+{
+  int nsimd = Simd::Nsimd();
+
+  // some assertions
+  assert(FiveDimGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
+  assert(FourDimGrid._ndimension==4);
+  assert(FourDimRedBlackGrid._ndimension==4);
+
+  // Dimension zero of the five-d is the Ls direction
+  Ls=FiveDimGrid._fdimensions[0];
+  assert(FiveDimGrid._processors[0]         ==1);
+  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
+
+  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FiveDimRedBlackGrid._processors[0] ==1);
+  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+
+  // Other dimensions must match the decomposition of the four-D fields 
+  for(int d=0;d<4;d++){
+    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+
+    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+
+    assert(FourDimGrid._simd_layout[d]=1);
+    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
+    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+
+    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+  }
+
+  // Allocate the required comms buffer
+  ImportGauge(_Umu);
+}  
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
-    GaugeField HUmu(_Umu._grid);
-    HUmu = _Umu*(-0.5);
+  GaugeField HUmu(_Umu._grid);
+  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
@@ -294,15 +356,12 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
+  int LLs = in._grid->_rdimensions[0];
  
  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
+  //  auto handle = st.HaloExchangeBegin(in,compressor);
+  //  st.HaloExchangeComplete(handle);
+  st.HaloExchange(in,compressor);
  commtime +=usecond();

  jointime -=usecond();
@@ -318,97 +377,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
+	for(int s=0;s<LLs;s++){
+	  int sU=ss;
+	  int sF = s+LLs*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
+	for(int s=0;s<LLs;s++){
+	  int sU=ss;
+	  int sF = s+LLs*sU;
+	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  } else {
    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();

-#pragma omp parallel for 
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=ss+ ssoff;
-
-	    if ( LebesgueOrder::UseLebesgueOrder ) {
-	      sU = lo.Reorder(sU);
-	    }
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	for(int s=0;s<LLs;s++){
+	  int sU=ss;
+	  int sF = s+LLs*sU;
+	  Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
    } else if( this->HandOptDslash ) {
-      /*
-
-#pragma omp parallel for schedule(static)
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+ ssoff;
-	  for(int s=soff;s<soff+swork;s++){
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      */
-
 PARALLEL_FOR_LOOP     
      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
+	for(int s=0;s<LLs;s++){
+	  int sU=ss;
+	  int sF = s+LLs*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
+	for(int s=0;s<LLs;s++){
+	  int sU=ss;
+	  int sF = s+LLs*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
@@ -418,251 +428,6 @@ PARALLEL_FOR_LOOP
  alltime+=usecond();
 }

-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
-						 DoubledGaugeField & U,
-						 const FermionField &in, FermionField &out,int dag)
-{
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  alltime-=usecond();
-  Compressor compressor(dag);
-
-  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
-  
-  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
-  commtime +=usecond();
-
-  jointime -=usecond();
-  jointime +=usecond();
-  
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  // Not loop ordering and data layout.
-  // Designed to create 
-  // - per thread reuse in L1 cache for U
-  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-
-#pragma omp parallel 
-  {
-  for(int jjj=0;jjj<100;jjj++){
-#pragma omp barrier
-  dslashtime -=usecond();
-  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-      }
-    } else { 
-
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-    }
-  } else {
-    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();
-
-#pragma omp for
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=ss+ ssoff;
-
-	    if ( LebesgueOrder::UseLebesgueOrder ) {
-	      sU = lo.Reorder(sU);
-	    }
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
-    } else if( this->HandOptDslash ) {
-#pragma omp for
-
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    } else { 
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    }
-  }
-  }
-  }
-  dslashtime +=usecond();
-  alltime+=usecond();
-}
-
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
-						DoubledGaugeField & U,
-						const FermionField &in, FermionField &out,int dag)
-{
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  alltime-=usecond();
-  Compressor compressor(dag);
-
-  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
-  
-  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
-  commtime +=usecond();
-
-  jointime -=usecond();
-  jointime +=usecond();
-  
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  // Not loop ordering and data layout.
-  // Designed to create 
-  // - per thread reuse in L1 cache for U
-  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-
-#pragma omp parallel 
-  {
-  for(int jjj=0;jjj<100;jjj++){
-#pragma omp barrier
-  dslashtime -=usecond();
-  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=0;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-      }
-    } else { 
-
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=0;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-    }
-  } else {
-    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();
-
-#pragma omp for
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=0;
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
-    } else if( this->HandOptDslash ) {
-#pragma omp for
-
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=0;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    } else { 
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=0;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    }
-  }
-  }
-  }
-  dslashtime +=usecond();
-  alltime+=usecond();
-}
-
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
-						     DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag)
-{
-  assert(0);
-}

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@@ -706,6 +471,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag

 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
+template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
+template class WilsonFermion5D<DomainWallRedBlack5dImplD>;

 }}

--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -87,6 +87,7 @@ namespace Grid {
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

      // These can be overridden by fancy 5d chiral action
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -121,32 +122,12 @@ namespace Grid {
 			FermionField &out,
 			int dag);

-      void DhopInternalOMPbench(StencilImpl & st,
-				LebesgueOrder &lo,
-				DoubledGaugeField &U,
-				const FermionField &in, 
-				FermionField &out,
-				int dag);
-
-      void DhopInternalL1bench(StencilImpl & st,
-				LebesgueOrder &lo,
-				DoubledGaugeField &U,
-				const FermionField &in, 
-				FermionField &out,
-				int dag);
-
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
 			const FermionField &in, 
 			FermionField &out,
 			int dag);
-      void DhopInternalCommsOverlapCompute(StencilImpl & st,
-			LebesgueOrder &lo,
-			DoubledGaugeField &U,
-			const FermionField &in, 
-			FermionField &out,
-			int dag);

      // Constructors
      WilsonFermion5D(GaugeField &_Umu,
@@ -156,6 +137,15 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      double _M5,const ImplParams &p= ImplParams());

+      // Constructors
+      WilsonFermion5D(int simd, 
+		      GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
+		      double _M5,const ImplParams &p= ImplParams());
+
      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);

--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 #endif

  FermOpTemplateInstantiate(WilsonKernels);
+template class WilsonKernels<DomainWallRedBlack5dImplF>;		
+template class WilsonKernels<DomainWallRedBlack5dImplD>;

 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
-#if defined(AVX512) || defined (IMCI)
+#if defined(AVX512) 
 //#if defined (IMCI)

 #include <simd/Intel512wilson.h>
@@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  template class WilsonKernels<WilsonImplD>; 
  template class WilsonKernels<GparityWilsonImplF>;
  template class WilsonKernels<GparityWilsonImplD>;
+  template class WilsonKernels<DomainWallRedBlack5dImplF>;
+  template class WilsonKernels<DomainWallRedBlack5dImplD>;
 }}
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chi_11 = ref()(1)(1);\
    Chi_12 = ref()(1)(2);

+// To splat or not to splat depends on the implementation
 #define MULT_2SPIN(A)\
   auto & ref(U._odata[sU](A));	\
-    U_00 = ref()(0,0);\
-    U_10 = ref()(1,0);\
-    U_20 = ref()(2,0);\
-    U_01 = ref()(0,1);\
-    U_11 = ref()(1,1);				\
-    U_21 = ref()(2,1);\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
    UChi_00 = U_00*Chi_00;\
    UChi_10 = U_00*Chi_10;\
    UChi_01 = U_10*Chi_00;\
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_11+= U_11*Chi_11;\
    UChi_02+= U_21*Chi_01;\
    UChi_12+= U_21*Chi_11;\
-    U_00 = ref()(0,2);\
-    U_10 = ref()(1,2);\
-    U_20 = ref()(2,2);\
+    Impl::loadLinkElement(U_00,ref()(0,2));	\
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
    UChi_00+= U_00*Chi_02;\
    UChi_10+= U_00*Chi_12;\
    UChi_01+= U_10*Chi_02;\
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;

+
 #define PERMUTE_DIR(dir)			\
      permute##dir(Chi_00,Chi_00);\
      permute##dir(Chi_01,Chi_01);\
@@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  //check consistency of return types between these functions and the ones in WilsonKernels.cc
  return 0;
  
 }
@@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D



+  //////////////
+/*
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
+  return 0;
+  
+}
+
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+  return 0;
+}
+
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+  return 0;
+}
+
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+  return 0;
+}
+
+*/
+
 template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
@@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);

+
+
+
+template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+
+
 }}
--- a/lib/qcd/action/gauge/GaugeImpl.h
+++ b/lib/qcd/action/gauge/GaugeImpl.h
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 #define INHERIT_GIMPL_TYPES(GImpl) \
    typedef typename GImpl::Simd                           Simd;\
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
-    typedef typename GImpl::GaugeField               GaugeField;	
+    typedef typename GImpl::GaugeField               GaugeField;\
+    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
+    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;


    // 
--- a/lib/qcd/utils/SpaceTimeGrid.cc
+++ b/lib/qcd/utils/SpaceTimeGrid.cc
@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
  return new GridRedBlackCartesian(FourDimGrid); 
 }
-
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+{
+  std::vector<int> simd(4,1);
+  return makeFourDimGrid(latt,simd,mpi);
+}
 GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;
@@ -58,6 +62,7 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
  return new GridCartesian(latt5,simd5,mpi5); 
 }

+
 GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 }

+
+GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  int N4=FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int>  mpi5(1,1);
+  
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+     mpi5.push_back(FourDimGrid->_processors[d]);
+  }
+  return new GridCartesian(latt5,simd5,mpi5); 
+}
+
+GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  int N4=FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+  int cbd=0;
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int>  mpi5(1,1);
+  std::vector<int>   cb5(1,1);
+    
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+     mpi5.push_back(FourDimGrid->_processors[d]);
+      cb5.push_back(1);
+    }
+  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
+}
+
+
 }}
--- a/lib/qcd/utils/SpaceTimeGrid.h
+++ b/lib/qcd/utils/SpaceTimeGrid.h
@@ -35,9 +35,14 @@ class SpaceTimeGrid {

  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
+
  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);

+  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
+  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
+  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+
 };

 }}