simd in 5th dimension support

2025-09-19 09:41:05 +01:00 · 2016-04-19 15:38:01 -07:00
parent 806a83d38b
commit 9b6ab6db16
7 changed files with 172 additions and 369 deletions
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -288,12 +288,8 @@ PARALLEL_FOR_LOOP
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
  {
    if ( Impl::overlapCommsCompute () ) { 
      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
    } else { 
    DhopInternalCommsThenCompute(st,U,in,out,dag);
  }
  }
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 							 const FermionField &in, FermionField &out,int dag) {
@@ -331,15 +327,6 @@ PARALLEL_FOR_LOOP
  };
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 						     const FermionField &in, FermionField &out,int dag) {
    assert(0);
  };
  FermOpTemplateInstantiate(WilsonFermion);
  GparityFermOpTemplateInstantiate(WilsonFermion);
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -116,9 +116,6 @@ namespace Grid {
      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 				    const FermionField &in, FermionField &out,int dag) ;
      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 				    const FermionField &in, FermionField &out,int dag) ;
      // Constructor
      WilsonFermion(GaugeField &_Umu,
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._checker_dim==1);
  // Dimension zero of the five-d is the Ls direction
@@ -106,6 +104,70 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  dslashtime=0;
  dslash1time=0;
 }  
 template<class Impl>
 WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 				       GridCartesian         &FiveDimGrid,
 				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 				       GridCartesian         &FourDimGrid,
 				       GridRedBlackCartesian &FourDimRedBlackGrid,
 				       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 {
  int nsimd = Simd::Nsimd();
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
  assert(FourDimGrid._ndimension==4);
  assert(FourDimRedBlackGrid._ndimension==4);
  // Dimension zero of the five-d is the Ls direction
  Ls=FiveDimGrid._fdimensions[0];
  assert(FiveDimGrid._processors[0]         ==1);
  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
  assert(FiveDimRedBlackGrid._processors[0] ==1);
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
  // Other dimensions must match the decomposition of the four-D fields 
  for(int d=0;d<4;d++){
    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
    assert(FourDimGrid._simd_layout[d]=1);
    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
  }
  // Allocate the required comms buffer
  ImportGauge(_Umu);
 }  
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -294,11 +356,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
  Compressor compressor(dag);
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
+  int LLs = in._grid->_rdimensions[0];
  int threads = GridThread::GetThreads();
  int HT      = GridThread::GetHyperThreads();
  int cores   = GridThread::GetCores();
  int nwork = U._grid->oSites();
  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
@@ -318,97 +376,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	for(int s=0;s<Ls;s++){
+	  int sF = s+LLs*sU;
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
+	for(int s=0;s<LLs;s++){
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
 	  int sU=ss;
-	    int sF = sd+Ls*sU;
+	  int sF = s+LLs*sU;
 	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
    }
  } else {
    if( this->AsmOptDslash ) {
      //      for(int i=0;i<1;i++){
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      //	PerformanceCounter Counter(i);
      //	Counter.Start();
 #pragma omp parallel for 
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
        int sswork, swork,soff,ssoff,  sU,sF;
 	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
 	  for(int s=soff;s<soff+swork;s++){
 	    sU=ss+ ssoff;
 	    if ( LebesgueOrder::UseLebesgueOrder ) {
 	      sU = lo.Reorder(sU);
 	    }
 	    sF = s+Ls*sU;
 	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
      //      Counter.Stop();
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
      /*
 #pragma omp parallel for schedule(static)
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
        int sswork, swork,soff,ssoff,  sU,sF;
 	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
 	  sU=ss+ ssoff;
 	  for(int s=soff;s<soff+swork;s++){
 	    sF = s+Ls*sU;
 	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
      */
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	for(int s=0;s<Ls;s++){
+	  int sF = s+LLs*sU;
-	  int sF = s+Ls*sU;
+	  Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP     
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<LLs;s++){
 	  int sU=ss;
 	  int sF = s+LLs*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	for(int s=0;s<Ls;s++){
+	  int sF = s+LLs*sU; 
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
@@ -418,251 +427,6 @@ PARALLEL_FOR_LOOP
  alltime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
 						 DoubledGaugeField & U,
 						 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
  Compressor compressor(dag);
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int threads = GridThread::GetThreads();
  int HT      = GridThread::GetHyperThreads();
  int cores   = GridThread::GetCores();
  int nwork = U._grid->oSites();
  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  st.HaloExchangeComplete(handle);
  commtime +=usecond();
  jointime -=usecond();
  jointime +=usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create 
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 #pragma omp parallel 
  {
  for(int jjj=0;jjj<100;jjj++){
 #pragma omp barrier
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	{
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=ss;
 	    int sF = sd+Ls*sU;
 	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
    if( this->AsmOptDslash ) {
      //      for(int i=0;i<1;i++){
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      //	PerformanceCounter Counter(i);
      //	Counter.Start();
 #pragma omp for
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
        int sswork, swork,soff,ssoff,  sU,sF;
 	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
 	  for(int s=soff;s<soff+swork;s++){
 	    sU=ss+ ssoff;
 	    if ( LebesgueOrder::UseLebesgueOrder ) {
 	      sU = lo.Reorder(sU);
 	    }
 	    sF = s+Ls*sU;
 	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
      //      Counter.Stop();
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=ss;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
  }
  }
  dslashtime +=usecond();
  alltime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
 						DoubledGaugeField & U,
 						const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
  Compressor compressor(dag);
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int threads = GridThread::GetThreads();
  int HT      = GridThread::GetHyperThreads();
  int cores   = GridThread::GetCores();
  int nwork = U._grid->oSites();
  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  st.HaloExchangeComplete(handle);
  commtime +=usecond();
  jointime -=usecond();
  jointime +=usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create 
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 #pragma omp parallel 
  {
  for(int jjj=0;jjj<100;jjj++){
 #pragma omp barrier
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	{
 	  int sd;
 	  for(sd=0;sd<Ls;sd++){
 	    int sU=0;
 	    int sF = sd+Ls*sU;
 	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
    }
  } else {
    if( this->AsmOptDslash ) {
      //      for(int i=0;i<1;i++){
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      //	PerformanceCounter Counter(i);
      //	Counter.Start();
 #pragma omp for
      for(int t=0;t<threads;t++){
 	int hyperthread = t%HT;
 	int core        = t/HT;
        int sswork, swork,soff,ssoff,  sU,sF;
 	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 	for(int ss=0;ss<sswork;ss++){
 	  for(int s=soff;s<soff+swork;s++){
 	    sU=0;
 	    sF = s+Ls*sU;
 	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	  }
 	}
      }
      //      Counter.Stop();
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 #pragma omp for
      for(int ss=0;ss<U._grid->oSites();ss++){
 	int sU=0;
 	for(int s=0;s<Ls;s++){
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
  }
  }
  }
  dslashtime +=usecond();
  alltime+=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,
 						     const FermionField &in, FermionField &out,int dag)
 {
  assert(0);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@@ -706,6 +470,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
 template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
 template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
 }}
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -87,6 +87,7 @@ namespace Grid {
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
      // These can be overridden by fancy 5d chiral action
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -121,32 +122,12 @@ namespace Grid {
 			FermionField &out,
 			int dag);
      void DhopInternalOMPbench(StencilImpl & st,
 				LebesgueOrder &lo,
 				DoubledGaugeField &U,
 				const FermionField &in, 
 				FermionField &out,
 				int dag);
      void DhopInternalL1bench(StencilImpl & st,
 				LebesgueOrder &lo,
 				DoubledGaugeField &U,
 				const FermionField &in, 
 				FermionField &out,
 				int dag);
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
 			const FermionField &in, 
 			FermionField &out,
 			int dag);
      void DhopInternalCommsOverlapCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
 			const FermionField &in, 
 			FermionField &out,
 			int dag);
      // Constructors
      WilsonFermion5D(GaugeField &_Umu,
@@ -156,6 +137,15 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      double _M5,const ImplParams &p= ImplParams());
      // Constructors
      WilsonFermion5D(int simd, 
 		      GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
 		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 		      GridCartesian         &FourDimGrid,
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      double _M5,const ImplParams &p= ImplParams());
      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 #endif
  FermOpTemplateInstantiate(WilsonKernels);
 template class WilsonKernels<DomainWallRedBlack5dImplF>;		
 template class WilsonKernels<DomainWallRedBlack5dImplD>;
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  template class WilsonKernels<WilsonImplD>; 
  template class WilsonKernels<GparityWilsonImplF>;
  template class WilsonKernels<GparityWilsonImplD>;
  template class WilsonKernels<DomainWallRedBlack5dImplF>;
  template class WilsonKernels<DomainWallRedBlack5dImplD>;
 }}
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chi_11 = ref()(1)(1);\
    Chi_12 = ref()(1)(2);
 // To splat or not to splat depends on the implementation
 #define MULT_2SPIN(A)\
   auto & ref(U._odata[sU](A));	\
-    U_00 = ref()(0,0);\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
-    U_10 = ref()(1,0);\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
-    U_20 = ref()(2,0);\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
-    U_01 = ref()(0,1);\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
-    U_11 = ref()(1,1);				\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
-    U_21 = ref()(2,1);\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
    UChi_00 = U_00*Chi_00;\
    UChi_10 = U_00*Chi_10;\
    UChi_01 = U_10*Chi_00;\
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_11+= U_11*Chi_11;\
    UChi_02+= U_21*Chi_01;\
    UChi_12+= U_21*Chi_11;\
-    U_00 = ref()(0,2);\
+    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    U_10 = ref()(1,2);\
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    U_20 = ref()(2,2);\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
    UChi_00+= U_00*Chi_02;\
    UChi_10+= U_00*Chi_12;\
    UChi_01+= U_10*Chi_02;\
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;
 #define PERMUTE_DIR(dir)			\
      permute##dir(Chi_00,Chi_00);\
      permute##dir(Chi_01,Chi_01);\
@@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
  return 0;
 }
@@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
  //////////////
 /*
 template<>
 int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
  return 0;
 }
 template<>
 int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
 }
 template<>
 int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
 }
 template<>
 int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
  return 0;
 }
 */
 template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
@@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 }}