Internal SHM comms working in non-SIMD directions

SIMD directions still need to be fixed
2026-05-31 06:24:18 +01:00 · 2016-10-22 18:14:27 +01:00
parent 0fcd2e7188
commit c190221fd3
16 changed files with 1729 additions and 1739 deletions
@@ -153,7 +153,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    Dw.Report();
@@ -192,7 +192,7 @@ int main (int argc, char ** argv)

    std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    sDw.Report();
  
    if(0){
@@ -262,7 +262,7 @@ int main (int argc, char ** argv)
      double flops=(1344.0*volume*ncall)/2;

      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
      sDw.Report();

      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -333,7 +333,7 @@ int main (int argc, char ** argv)
    double flops=(1344.0*volume*ncall)/2;

    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    Dw.Report();
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -80,7 +80,6 @@ class CartesianCommunicator {

    void * ShmCommBuf;
    std::vector<void *> ShmCommBufs;
-    std::vector<void *> ShmStencilBufs;

    int WorldRank;
    int WorldSize;
@@ -105,6 +104,10 @@ class CartesianCommunicator {
    int  RankFromProcessorCoor(std::vector<int> &coor);
    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);

+    // Helper functions for SHM windows in MPI-3
+    void *ShmBufferSelf(void);
+    void *ShmBuffer(int rank);
+
    /////////////////////////////////
    // Grid information queries
    /////////////////////////////////
@@ -173,6 +176,16 @@ class CartesianCommunicator {
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+    void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			 void *xmit,
+			 int xmit_to_rank,
+			 void *recv,
+			 int recv_from_rank,
+			 int bytes);
+    void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) // Stencil-named alias of SendToRecvFromComplete; takes the request list filled by a Begin call (presumably StencilSendToRecvFromBegin - confirm).
+    {
+      SendToRecvFromComplete(waitall); // Pure forwarder: the generic completion routine handles the same request list.
+    }

    ////////////////////////////////////////////////////////////
    // Barrier
@@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  
  assert(Size==_Nprocessors);
 }
+void *CartesianCommunicator::ShmBufferSelf(void) // Stub in this implementation: always reports that there is no shared-memory buffer.
+{
+  return NULL; // NOTE(review): presumably this communicator variant has no MPI-3 shared window - confirm.
+}
+void *CartesianCommunicator::ShmBuffer(int rank) // Stub in this implementation: `rank` is ignored and no buffer is ever returned.
+{
+  return NULL; // Callers must treat NULL as "no shared-memory buffer for this rank".
+}

 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
@@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Verbose for now
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    std::cout<< "Ranks per node "<< ShmSize << std::endl;
-    std::cout<< "Nodes          "<< GroupSize << std::endl;
-    std::cout<< "Ranks          "<< WorldSize << std::endl;
-    std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;
+    std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
+    std::cout<< " Nodes "<< GroupSize;
+    std::cout<< " Ranks "<< WorldSize;
+    std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;

    // Done
    ShmSetup=1;
@@ -208,12 +208,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  }

  ShmCommBufs.resize(ShmSize);
-  ShmStencilBufs.resize(ShmSize);
  for(int r=0;r<ShmSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
-    ShmStencilBufs[r] = (void *) ((uint64_t)ShmCommBufs[r]+MAX_MPI_SHM_BYTES/4);
  }
  
  ////////////////////////////////////////////////////////////////
@@ -240,6 +238,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
+
  for(int l2=0;l2<log2size;l2++){
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
    ShmDims[dim]*=2;
@@ -347,6 +346,21 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
  }
 }

+
+void *CartesianCommunicator::ShmBufferSelf(void) // Returns this rank's own entry in ShmCommBufs.
+{
+  return ShmCommBufs[ShmRank]; // Indexed by ShmRank; ShmCommBufs entries come from the MPI_Win_shared_query loop in the constructor.
+}
+void *CartesianCommunicator::ShmBuffer(int rank) // Looks up the ShmCommBufs entry for `rank`; returns NULL when GroupRanks has no mapping for it.
+{
+  int gpeer = GroupRanks[rank]; // No bounds check here: `rank` must be a valid index into GroupRanks.
+  if (gpeer == MPI_UNDEFINED){ // MPI_UNDEFINED entry => no shared buffer for this rank (NOTE(review): presumably the rank is on another node - confirm).
+    return NULL;
+  } else { 
+    return ShmCommBufs[gpeer]; // gpeer is used directly as the index into the per-rank buffer table.
+  }
+}
+
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
@@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-#undef SHM_USE_BCOPY
  MPI_Request xrq;
  MPI_Request rrq;
  
  static int sequence;

-  int rank = _processor;
  int ierr;
  int tag;
  int check;
@@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  assert(from != _processor);
  
  int gdest = GroupRanks[dest];
+  int gfrom = GroupRanks[from];
  int gme   = GroupRanks[_processor];

  sequence++;
@@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis

  int small = (bytes<MAX_MPI_SHM_BYTES);

-#ifndef SHM_USE_BCOPY
  typedef vRealD T;
  int words = bytes/sizeof(T);
-  assert(((size_t)bytes &(sizeof(T)-1))==0);
-  //  assert(((size_t)xmit  &(sizeof(T)-1))==0);
-  //  assert(((size_t)recv  &(sizeof(T)-1))==0);
-#endif

+  assert(((size_t)bytes &(sizeof(T)-1))==0);
  assert(gme == ShmRank);

-  //  std::cerr << "proc dest from gme  gdest "<<_processor<<" "<<dest <<" "<< from <<" "<<gme<<" "<< gdest<<std::endl; Barrier();
-  if ( small && (dest !=MPI_UNDEFINED) ) {
+  if ( small && (gdest !=MPI_UNDEFINED) ) {
+
    assert(gme != gdest);

-#ifdef SHM_USE_BCOPY
-    bcopy(xmit,to_ptr,bytes);
-#else
    T *ip = (T *)xmit;
    T *op = (T *)to_ptr;
-    PARALLEL_FOR_LOOP 
+PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      vstream(op[w],ip[w]);
    }
-#endif
+
    bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
    bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence));
  } else { 
@@ -411,24 +417,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
    list.push_back(xrq);
  }
  
-  //  std::cout << "Syncing "<<std::endl; Barrier();
  MPI_Win_sync (ShmWindow);   
  MPI_Barrier  (ShmComm);
  MPI_Win_sync (ShmWindow);   

-  //  std::cout << "Receiving "<<std::endl; Barrier();
-  
-  if (small && (from !=MPI_UNDEFINED) ) {
-#ifdef SHM_USE_BCOPY
-    bcopy(from_ptr,recv,bytes);
-#else
+  if (small && (gfrom !=MPI_UNDEFINED) ) {
    T *ip = (T *)from_ptr;
    T *op = (T *)recv;
-    PARALLEL_FOR_LOOP 
+PARALLEL_FOR_LOOP 
    for(int w=0;w<words;w++) {
      vstream(op[w],ip[w]);
    }
-#endif
    bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag));
    bcopy(&from_ptr[bytes+4],&check,sizeof(check));
    assert(check==sequence);
@@ -439,27 +438,51 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
    list.push_back(rrq);
  }

-  //  std::cout << "Syncing"<<std::endl; Barrier();
+  MPI_Win_sync (ShmWindow);   
+  MPI_Barrier  (ShmComm);
+  MPI_Win_sync (ShmWindow);   
+}
+
+void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, // Starts an exchange; MPI requests are posted (and appended to `list`) only for peers whose GroupRanks entry is MPI_UNDEFINED.
+						       void *xmit, // buffer handed to MPI_Isend
+						       int dest, // destination rank; asserted to differ from _processor
+						       void *recv, // buffer handed to MPI_Irecv
+						       int from, // source rank; asserted to differ from _processor
+						       int bytes) // message length, transferred as `bytes` MPI_CHAR elements
+{ // NOTE(review): when a peer does have a GroupRanks mapping, no data is moved in this function - presumably the caller writes through ShmBuffer(); confirm.
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  
+  int gdest = GroupRanks[dest]; // compared against MPI_UNDEFINED below to choose the MPI path
+  int gfrom = GroupRanks[from];
+  int gme   = GroupRanks[_processor];
+
+  assert(gme == ShmRank); // sanity check: our own GroupRanks entry must equal ShmRank
+
+  if ( gdest == MPI_UNDEFINED ) { // no GroupRanks mapping for dest: use a non-blocking MPI send
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); // message tag is the sender's rank
+    assert(ierr==0);
+    list.push_back(xrq); // queued so a later ...Complete call can wait on it
+  }
+  
+  if ( gfrom ==MPI_UNDEFINED) { // no GroupRanks mapping for from: post a non-blocking MPI receive
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); // tag is the sender's rank, matching the Isend tag convention above
+    assert(ierr==0);
+    list.push_back(rrq);
+  }
 
   MPI_Win_sync (ShmWindow);   
   MPI_Barrier  (ShmComm); // executed unconditionally, so every rank in ShmComm must reach this call
   MPI_Win_sync (ShmWindow);   
   
-#if 0
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
-  
-  assert(ierr==0);
-
-  list.push_back(xrq);
-  list.push_back(rrq);
-#endif
 }

+
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
@@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 }

 int Rank(void ){ return 0; };
+void *CartesianCommunicator::ShmBufferSelf(void) // Stub in this implementation: always reports that there is no shared-memory buffer.
+{
+  return NULL; // NOTE(review): presumably this is the communicator variant without MPI (Rank() above returns 0) - confirm.
+}
+void *CartesianCommunicator::ShmBuffer(int rank) // Stub in this implementation: `rank` is ignored and no buffer is ever returned.
+{
+  return NULL; // Callers must treat NULL as "no shared-memory buffer for this rank".
+}

 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
@@ -50,6 +50,14 @@ typedef struct HandShake_t {
 static Vector< HandShake > XConnections;
 static Vector< HandShake > RConnections;

+void *CartesianCommunicator::ShmBufferSelf(void) // Stub in this implementation: always reports that there is no shared-memory buffer.
+{
+  return NULL; // NOTE(review): this file initialises via shmem_init(); presumably no MPI-3 window buffers exist here - confirm.
+}
+void *CartesianCommunicator::ShmBuffer(int rank) // Stub in this implementation: `rank` is ignored and no buffer is ever returned.
+{
+  return NULL; // Callers must treat NULL as "no shared-memory buffer for this rank".
+}
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  shmem_init();
  XConnections.resize(shmem_n_pes());
@@ -33,511 +33,500 @@ directory
 #define GRID_QCD_FERMION_OPERATOR_IMPL_H

 namespace Grid {
-
-  namespace QCD {
+namespace QCD {


-    //////////////////////////////////////////////
-    // Template parameter class constructs to package
-    // externally control Fermion implementations
-    // in orthogonal directions
-    //
-    // Ultimately need Impl to always define types where XXX is opaque
-    //
-    //    typedef typename XXX               Simd;
-    //    typedef typename XXX     GaugeLinkField;	
-    //    typedef typename XXX         GaugeField;
-    //    typedef typename XXX      GaugeActField;
-    //    typedef typename XXX       FermionField;
-    //    typedef typename XXX  DoubledGaugeField;
-    //    typedef typename XXX         SiteSpinor;
-    //    typedef typename XXX     SiteHalfSpinor;	
-    //    typedef typename XXX         Compressor;	
-    //
-    // and Methods:
-    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
-    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
-    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
-    //
-    //
-    // To acquire the typedefs from "Base" (either a base class or template param) use:
-    //
-    // INHERIT_GIMPL_TYPES(Base)
-    // INHERIT_FIMPL_TYPES(Base)
-    // INHERIT_IMPL_TYPES(Base)
-    //
-    // The Fermion operators will do the following:
-    //
-    // struct MyOpParams { 
-    //   RealD mass;
-    // };
-    //
-    //
-    // template<class Impl>
-    // class MyOp : public<Impl> { 
-    // public:
-    //
-    //    INHERIT_ALL_IMPL_TYPES(Impl);
-    //
-    //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
-    //    {
-    //
-    //    };
-    //    
-    //  }
-    //////////////////////////////////////////////
+  //////////////////////////////////////////////
+  // Template parameter class constructs to package
+  // externally control Fermion implementations
+  // in orthogonal directions
+  //
+  // Ultimately need Impl to always define types where XXX is opaque
+  //
+  //    typedef typename XXX               Simd;
+  //    typedef typename XXX     GaugeLinkField;	
+  //    typedef typename XXX         GaugeField;
+  //    typedef typename XXX      GaugeActField;
+  //    typedef typename XXX       FermionField;
+  //    typedef typename XXX  DoubledGaugeField;
+  //    typedef typename XXX         SiteSpinor;
+  //    typedef typename XXX     SiteHalfSpinor;	
+  //    typedef typename XXX         Compressor;	
+  //
+  // and Methods:
+  //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+  //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+  //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+  //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+  //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
+  //
+  //
+  // To acquire the typedefs from "Base" (either a base class or template param) use:
+  //
+  // INHERIT_GIMPL_TYPES(Base)
+  // INHERIT_FIMPL_TYPES(Base)
+  // INHERIT_IMPL_TYPES(Base)
+  //
+  // The Fermion operators will do the following:
+  //
+  // struct MyOpParams { 
+  //   RealD mass;
+  // };
+  //
+  //
+  // template<class Impl>
+  // class MyOp : public<Impl> { 
+  // public:
+  //
+  //    INHERIT_ALL_IMPL_TYPES(Impl);
+  //
+  //    MyOp(MyOpParams Myparm, ImplParams &ImplParam) :  Impl(ImplParam)
+  //    {
+  //
+  //    };
+  //    
+  //  }
+  //////////////////////////////////////////////
  

-    ////////////////////////////////////////////////////////////////////////
-    // Implementation dependent fermion types
-    ////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////
+  // Implementation dependent fermion types
+  ////////////////////////////////////////////////////////////////////////
  
 #define INHERIT_FIMPL_TYPES(Impl)\
-    typedef typename Impl::FermionField           FermionField;		\
-    typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
-    typedef typename Impl::SiteSpinor               SiteSpinor;		\
-    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
-    typedef typename Impl::Compressor               Compressor;		\
-    typedef typename Impl::StencilImpl             StencilImpl;		\
-    typedef typename Impl::ImplParams ImplParams;			\
-    typedef typename Impl::Coeff_t       Coeff_t;
+  typedef typename Impl::FermionField           FermionField;		\
+  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
+  typedef typename Impl::SiteSpinor               SiteSpinor;		\
+  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
+  typedef typename Impl::Compressor               Compressor;		\
+  typedef typename Impl::StencilImpl             StencilImpl;		\
+  typedef typename Impl::ImplParams ImplParams;				\
+  typedef typename Impl::Coeff_t       Coeff_t;
  
 #define INHERIT_IMPL_TYPES(Base) \
-    INHERIT_GIMPL_TYPES(Base)	 \
-    INHERIT_FIMPL_TYPES(Base)
+  INHERIT_GIMPL_TYPES(Base)	 \
+  INHERIT_FIMPL_TYPES(Base)
  
-    ///////
-    // Single flavour four spinors with colour index
-    ///////
-    template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
-    class WilsonImpl
-      : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
-    public:
-      static const int Dimension = Representation::Dimension;
-      typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+  /////////////////////////////////////////////////////////////////////////////
+  // Single flavour four spinors with colour index
+  /////////////////////////////////////////////////////////////////////////////
+  template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
+  class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {

-      //Necessary?
-      constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
-
-      const bool LsVectorised=false;
-      typedef _Coeff_t Coeff_t;
-
-
-      INHERIT_GIMPL_TYPES(Gimpl);
-      
-      template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
-      template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
-      template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
-      
-      typedef iImplSpinor<Simd>            SiteSpinor;
-      typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
-      typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
-      
-      typedef Lattice<SiteSpinor>            FermionField;
-      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
-      
-      typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
-      typedef WilsonImplParams ImplParams;
-      typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
-      
-      ImplParams Params;
-      
-      WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
-      
-      bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
-      
-      inline void multLink(SiteHalfSpinor &phi,
-			   const SiteDoubledGaugeField &U,
-			   const SiteHalfSpinor &chi,
-			   int mu,
-			   StencilEntry *SE,
-			   StencilImpl &St) {
-	mult(&phi(), &U(mu), &chi());
-      }
-      
-      template <class ref>
-      inline void loadLinkElement(Simd &reg,
-				  ref &memory) {
-	reg = memory;
-      }
-      
-      inline void DoubleStore(GridBase *GaugeGrid,
-			      DoubledGaugeField &Uds,
-			      const GaugeField &Umu) {
-	conformable(Uds._grid, GaugeGrid);
-	conformable(Umu._grid, GaugeGrid);
-	GaugeLinkField U(GaugeGrid);
-	for (int mu = 0; mu < Nd; mu++) {
-	  U = PeekIndex<LorentzIndex>(Umu, mu);
-	  PokeIndex<LorentzIndex>(Uds, U, mu);
-	  U = adj(Cshift(U, mu, -1));
-	  PokeIndex<LorentzIndex>(Uds, U, mu + 4);
-	}
-      }
-
-      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
-	GaugeLinkField link(mat._grid);
-	link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
-	PokeIndex<LorentzIndex>(mat,link,mu);
-      }   
-      
-      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-	
-	int Ls=Btilde._grid->_fdimensions[0];
-	GaugeLinkField tmp(mat._grid);
-	tmp = zero;
-
-        PARALLEL_FOR_LOOP
-	  for(int sss=0;sss<tmp._grid->oSites();sss++){
-	    int sU=sss;
-	    for(int s=0;s<Ls;s++){
-	      int sF = s+Ls*sU;
-	      tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
-	    }
-	  }
-	PokeIndex<LorentzIndex>(mat,tmp,mu);
-
-      }
-    };
-
-    ///////
-    // Single flavour four spinors with colour index, 5d redblack
-    ///////
-    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
-    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:

-      static const int Dimension = Nrepresentation;
-      const bool LsVectorised=true;
-      typedef _Coeff_t Coeff_t;      
-      typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
+    static const int Dimension = Representation::Dimension;
+    typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
      
-      INHERIT_GIMPL_TYPES(Gimpl);
+    //Necessary?
+    constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
    
-      template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
-      template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
-      template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
-      template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
-      template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+    const bool LsVectorised=false;
+    typedef _Coeff_t Coeff_t;

-      typedef iImplSpinor<Simd> SiteSpinor;
-      typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
-      typedef Lattice<SiteSpinor> FermionField;
+    INHERIT_GIMPL_TYPES(Gimpl);
      
-      // Make the doubled gauge field a *scalar*
-      typedef iImplDoubledGaugeField<typename Simd::scalar_type>
-      SiteDoubledGaugeField;  // This is a scalar
-      typedef iImplGaugeField<typename Simd::scalar_type>
-      SiteScalarGaugeField;  // scalar
-      typedef iImplGaugeLink<typename Simd::scalar_type>
-      SiteScalarGaugeLink;  // scalar
+    template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
+    template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+    template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
    
-      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+    typedef iImplSpinor<Simd>            SiteSpinor;
+    typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+    typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
    
-      typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
-      typedef WilsonImplParams ImplParams;
-      typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
+    typedef Lattice<SiteSpinor>            FermionField;
+    typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
    
-      ImplParams Params;
+    typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
+    typedef WilsonImplParams ImplParams;
+    typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
    
-      DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+    ImplParams Params;
    
-      bool overlapCommsCompute(void) { return false; };
+    WilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
      
-      template <class ref>
-      inline void loadLinkElement(Simd &reg, ref &memory) {
-	vsplat(reg, memory);
-      }
-      inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
-			   const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
-			   StencilImpl &St) {
-	SiteGaugeLink UU;
-	for (int i = 0; i < Nrepresentation; i++) {
-	  for (int j = 0; j < Nrepresentation; j++) {
-	    vsplat(UU()()(i, j), U(mu)()(i, j));
-	  }
-	}
-	mult(&phi(), &UU(), &chi());
+    bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
+      
+    inline void multLink(SiteHalfSpinor &phi,
+			 const SiteDoubledGaugeField &U,
+			 const SiteHalfSpinor &chi,
+			 int mu,
+			 StencilEntry *SE,
+			 StencilImpl &St) {
+      mult(&phi(), &U(mu), &chi());
+    }
+      
+    template <class ref>
+    inline void loadLinkElement(Simd &reg, ref &memory) {
+      reg = memory;
+    }
+      
+    inline void DoubleStore(GridBase *GaugeGrid,
+			    DoubledGaugeField &Uds,
+			    const GaugeField &Umu) {
+      conformable(Uds._grid, GaugeGrid);
+      conformable(Umu._grid, GaugeGrid);
+      GaugeLinkField U(GaugeGrid);
+      for (int mu = 0; mu < Nd; mu++) {
+	U = PeekIndex<LorentzIndex>(Umu, mu);
+	PokeIndex<LorentzIndex>(Uds, U, mu);
+	U = adj(Cshift(U, mu, -1));
+	PokeIndex<LorentzIndex>(Uds, U, mu + 4);
      }
+    }

-      inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
-			      const GaugeField &Umu) {
-	SiteScalarGaugeField ScalarUmu;
-	SiteDoubledGaugeField ScalarUds;
+    inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+      GaugeLinkField link(mat._grid);
+      link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+      PokeIndex<LorentzIndex>(mat,link,mu);
+    }   
      
-	GaugeLinkField U(Umu._grid);
-	GaugeField Uadj(Umu._grid);
-	for (int mu = 0; mu < Nd; mu++) {
-	  U = PeekIndex<LorentzIndex>(Umu, mu);
-	  U = adj(Cshift(U, mu, -1));
-	  PokeIndex<LorentzIndex>(Uadj, U, mu);
-	}
+    inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
      
-	for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
-	  std::vector<int> lcoor;
-	  GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      int Ls=Btilde._grid->_fdimensions[0];
+      GaugeLinkField tmp(mat._grid);
+      tmp = zero;
      
-	  peekLocalSite(ScalarUmu, Umu, lcoor);
-	  for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
-	  
-	  peekLocalSite(ScalarUmu, Uadj, lcoor);
-	  for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
-	  
-	  pokeLocalSite(ScalarUds, Uds, lcoor);
+      PARALLEL_FOR_LOOP
+      for(int sss=0;sss<tmp._grid->oSites();sss++){
+	int sU=sss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
 	}
      }
+      PokeIndex<LorentzIndex>(mat,tmp,mu);
      
-      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
-				FermionField &A, int mu) {
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////////
+  // Single flavour four spinors with colour index, 5d redblack
+  ////////////////////////////////////////////////////////////////////////////////////
+
+template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
+class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+  public:
+      
+  static const int Dimension = Nrepresentation;
+  const bool LsVectorised=true;
+  typedef _Coeff_t Coeff_t;      
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
+  
+  INHERIT_GIMPL_TYPES(Gimpl);
+  
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
+  template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
+  template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+  
+  typedef iImplSpinor<Simd> SiteSpinor;
+  typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
+  typedef Lattice<SiteSpinor> FermionField;
+  
+  // Make the doubled gauge field a *scalar*
+  typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar
+  typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar
+  typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar
+      
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+      
+  typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
+  
+  ImplParams Params;
+  
+  DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
+      
+  bool overlapCommsCompute(void) { return false; };
+      
+  template <class ref>
+  inline void loadLinkElement(Simd &reg, ref &memory) {
+    vsplat(reg, memory);
+  }
+
+  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
+		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+		       StencilImpl &St) {
+    SiteGaugeLink UU;
+    for (int i = 0; i < Nrepresentation; i++) {
+      for (int j = 0; j < Nrepresentation; j++) {
+	vsplat(UU()()(i, j), U(mu)()(i, j));
+      }
+    }
+    mult(&phi(), &UU(), &chi());
+  }
+      
+  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
+  {
+    SiteScalarGaugeField ScalarUmu;
+    SiteDoubledGaugeField ScalarUds;
+    
+    GaugeLinkField U(Umu._grid);
+    GaugeField Uadj(Umu._grid);
+    for (int mu = 0; mu < Nd; mu++) {
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+      U = adj(Cshift(U, mu, -1));
+      PokeIndex<LorentzIndex>(Uadj, U, mu);
+    }
+    
+    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+      std::vector<int> lcoor;
+      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+      
+      peekLocalSite(ScalarUmu, Umu, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
+      
+      peekLocalSite(ScalarUmu, Uadj, lcoor);
+      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
+      
+      pokeLocalSite(ScalarUds, Uds, lcoor);
+    }
+  }
+      
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) 
+  {
+    assert(0);
+  }
+      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu) 
+  {
 	assert(0);
-      }
-      
-      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
-				FermionField &Atilde, int mu) {
-	assert(0);
-      }
-    };
+  }
+};
    
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
    
-    template <class S, int Nrepresentation,class _Coeff_t = RealD>
-    class GparityWilsonImpl
-      : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
-    public:
-      static const int Dimension = Nrepresentation;
+template <class S, int Nrepresentation,class _Coeff_t = RealD>
+class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
+ public:

-      const bool LsVectorised=false;
+ static const int Dimension = Nrepresentation;

-      typedef _Coeff_t Coeff_t;
-      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
+ const bool LsVectorised=false;

-      INHERIT_GIMPL_TYPES(Gimpl);
+ typedef _Coeff_t Coeff_t;
+ typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
 
-      template <typename vtype>
-      using iImplSpinor =
-      iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
-      template <typename vtype>
-      using iImplHalfSpinor =
-	iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
-      template <typename vtype>
-      using iImplDoubledGaugeField =
-	iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
+ INHERIT_GIMPL_TYPES(Gimpl);
      
-      typedef iImplSpinor<Simd> SiteSpinor;
-      typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
-      typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+ template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
+ template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
+ template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
      
-      typedef Lattice<SiteSpinor> FermionField;
-      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+ typedef iImplSpinor<Simd> SiteSpinor;
+ typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
+ typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
 
-      typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
-      typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
+ typedef Lattice<SiteSpinor> FermionField;
+ typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 
-      typedef GparityWilsonImplParams ImplParams;
+ typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
+ typedef WilsonStencil<SiteSpinor, SiteHalfSpinor> StencilImpl;
 
-      ImplParams Params;
+ typedef GparityWilsonImplParams ImplParams;
      
+ ImplParams Params;

-      GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
+ GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};

-      bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
+ bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };

-      // provide the multiply by link that is differentiated between Gparity (with
-      // flavour index) and non-Gparity
-      inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
-			   const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
-			   StencilImpl &St) {
-	typedef SiteHalfSpinor vobj;
-	typedef typename SiteHalfSpinor::scalar_object sobj;
+ // provide the multiply by link that is differentiated between Gparity (with
+ // flavour index) and non-Gparity
+ inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
+		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+		      StencilImpl &St) {

-	vobj vtmp;
-	sobj stmp;
+  typedef SiteHalfSpinor vobj;
+   typedef typename SiteHalfSpinor::scalar_object sobj;
 	
-	GridBase *grid = St._grid;
+   vobj vtmp;
+   sobj stmp;
 	
-	const int Nsimd = grid->Nsimd();
+   GridBase *grid = St._grid;
 	
-	int direction = St._directions[mu];
-	int distance = St._distances[mu];
-	int ptype = St._permute_type[mu];
-	int sl = St._grid->_simd_layout[direction];
+   const int Nsimd = grid->Nsimd();
 	
-	// Fixme X.Y.Z.T hardcode in stencil
-	int mmu = mu % Nd;
+   int direction = St._directions[mu];
+   int distance = St._distances[mu];
+   int ptype = St._permute_type[mu];
+   int sl = St._grid->_simd_layout[direction];
   
-	// assert our assumptions
-	assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
-	assert((sl == 1) || (sl == 2));
+   // Fixme X.Y.Z.T hardcode in stencil
+   int mmu = mu % Nd;
 	
-	std::vector<int> icoor;
+   // assert our assumptions
+   assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
+   assert((sl == 1) || (sl == 2));
   
-	if ( SE->_around_the_world && Params.twists[mmu] ) {
+   std::vector<int> icoor;
 	
-	  if ( sl == 2 ) {
+   if ( SE->_around_the_world && Params.twists[mmu] ) {

-	    std::vector<sobj> vals(Nsimd);
+     if ( sl == 2 ) {
       
-	    extract(chi,vals);
-	    for(int s=0;s<Nsimd;s++){
+       std::vector<sobj> vals(Nsimd);

-	      grid->iCoorFromIindex(icoor,s);
+       extract(chi,vals);
+       for(int s=0;s<Nsimd;s++){

-	      assert((icoor[direction]==0)||(icoor[direction]==1));
+	 grid->iCoorFromIindex(icoor,s);
 	      
-	      int permute_lane;
-	      if ( distance == 1) {
-		permute_lane = icoor[direction]?1:0;
-	      } else {
-		permute_lane = icoor[direction]?0:1;
+	 assert((icoor[direction]==0)||(icoor[direction]==1));
+	      
+	 int permute_lane;
+	 if ( distance == 1) {
+	   permute_lane = icoor[direction]?1:0;
+	 } else {
+	   permute_lane = icoor[direction]?0:1;
+	 }
+	      
+	 if ( permute_lane ) { 
+	   stmp(0) = vals[s](1);
+	   stmp(1) = vals[s](0);
+	   vals[s] = stmp;
 	      }
+       }
+       merge(vtmp,vals);
 	    
-	      if ( permute_lane ) { 
-		stmp(0) = vals[s](1);
-		stmp(1) = vals[s](0);
-		vals[s] = stmp;
-	      }
-	    }
-	    merge(vtmp,vals);
+     } else { 
+       vtmp(0) = chi(1);
+       vtmp(1) = chi(0);
+     }
+     mult(&phi(0),&U(0)(mu),&vtmp(0));
+     mult(&phi(1),&U(1)(mu),&vtmp(1));
     
-	  } else { 
-	    vtmp(0) = chi(1);
-	    vtmp(1) = chi(0);
-	  }
-	  mult(&phi(0),&U(0)(mu),&vtmp(0));
-	  mult(&phi(1),&U(1)(mu),&vtmp(1));
+   } else { 
+     mult(&phi(0),&U(0)(mu),&chi(0));
+     mult(&phi(1),&U(1)(mu),&chi(1));
+   }
   
-	} else { 
-	  mult(&phi(0),&U(0)(mu),&chi(0));
-	  mult(&phi(1),&U(1)(mu),&chi(1));
-	}
+ }

-  }
+ inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+ {
+   conformable(Uds._grid,GaugeGrid);
+   conformable(Umu._grid,GaugeGrid);
   
-      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-      {
+   GaugeLinkField Utmp (GaugeGrid);
+   GaugeLinkField U    (GaugeGrid);
+   GaugeLinkField Uconj(GaugeGrid);
   
-	conformable(Uds._grid,GaugeGrid);
-	conformable(Umu._grid,GaugeGrid);
+   Lattice<iScalar<vInteger> > coor(GaugeGrid);
 	
-	GaugeLinkField Utmp (GaugeGrid);
-	GaugeLinkField U    (GaugeGrid);
-	GaugeLinkField Uconj(GaugeGrid);
+   for(int mu=0;mu<Nd;mu++){
 	  
-	Lattice<iScalar<vInteger> > coor(GaugeGrid);
+     LatticeCoordinate(coor,mu);
+	  
+     U     = PeekIndex<LorentzIndex>(Umu,mu);
+     Uconj = conjugate(U);
+     
+     // This phase could come from a simple bc 1,1,-1,1 ..
+     int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
+     if ( Params.twists[mu] ) { 
+       Uconj = where(coor==neglink,-Uconj,Uconj);
+     }
+	  
+PARALLEL_FOR_LOOP
+     for(auto ss=U.begin();ss<U.end();ss++){
+       Uds[ss](0)(mu) = U[ss]();
+       Uds[ss](1)(mu) = Uconj[ss]();
+     }
+	  
+     U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
+     Uconj = adj(Cshift(Uconj,mu,-1));
+ 
+     Utmp = U;
+     if ( Params.twists[mu] ) { 
+       Utmp = where(coor==0,Uconj,Utmp);
+     }
+	  
+PARALLEL_FOR_LOOP
+     for(auto ss=U.begin();ss<U.end();ss++){
+       Uds[ss](0)(mu+4) = Utmp[ss]();
+     }
+	  
+     Utmp = Uconj;
+     if ( Params.twists[mu] ) { 
+       Utmp = where(coor==0,U,Utmp);
+     }
+	  
+PARALLEL_FOR_LOOP
+     for(auto ss=U.begin();ss<U.end();ss++){
+       Uds[ss](1)(mu+4) = Utmp[ss]();
+     }
+	  
+   }
+ }
      
      
-	for(int mu=0;mu<Nd;mu++){
+ inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {

-	  LatticeCoordinate(coor,mu);
+   // DhopDir provides U or Uconj depending on coor/flavour.
+   GaugeLinkField link(mat._grid);
+   // use lorentz for flavour as hack.
+   auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
+PARALLEL_FOR_LOOP
+   for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+     link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
+   }
+   PokeIndex<LorentzIndex>(mat, link, mu);
+   return;
+ }
      
-	  U     = PeekIndex<LorentzIndex>(Umu,mu);
-	  Uconj = conjugate(U);
+ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {

-	  // This phase could come from a simple bc 1,1,-1,1 ..
-	  int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
-	  if ( Params.twists[mu] ) { 
-	    Uconj = where(coor==neglink,-Uconj,Uconj);
-	  }
+   int Ls = Btilde._grid->_fdimensions[0];
 	
+   GaugeLinkField tmp(mat._grid);
+   tmp = zero;
+PARALLEL_FOR_LOOP
+   for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
+     for (int s = 0; s < Ls; s++) {
+       int sF = s + Ls * ss;
+       auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
+       tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+     }
+   }
+   PokeIndex<LorentzIndex>(mat, tmp, mu);
+   return;
+ }

-	  PARALLEL_FOR_LOOP
-	    for(auto ss=U.begin();ss<U.end();ss++){
-	      Uds[ss](0)(mu) = U[ss]();
-	      Uds[ss](1)(mu) = Uconj[ss]();
-	    }
+};

-	  U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
-	  Uconj = adj(Cshift(Uconj,mu,-1));
+ typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec
+ typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
+ typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double

-	  Utmp = U;
-	  if ( Params.twists[mu] ) { 
-	    Utmp = where(coor==0,Uconj,Utmp);
-	  }
+ typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
+ typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
+ typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
 
-	  PARALLEL_FOR_LOOP
-	    for(auto ss=U.begin();ss<U.end();ss++){
-	      Uds[ss](0)(mu+4) = Utmp[ss]();
-	    }
+ typedef WilsonImpl<vComplex,  AdjointRepresentation > WilsonAdjImplR;   // Real.. whichever prec
+ typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF;  // Float
+ typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD;  // Double
 
-	  Utmp = Uconj;
-	  if ( Params.twists[mu] ) { 
-	    Utmp = where(coor==0,U,Utmp);
-	  }
+ typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR;   // Real.. whichever prec
+ typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF;  // Float
+ typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD;  // Double
 
-	  PARALLEL_FOR_LOOP
-	    for(auto ss=U.begin();ss<U.end();ss++){
-	      Uds[ss](1)(mu+4) = Utmp[ss]();
-	    }
+ typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+ typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+ typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
 
-	}
-      }
+ typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
+ typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
+ typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
 
+ typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec
+ typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float
+ typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double

-      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
-				FermionField &A, int mu) {
-	// DhopDir provides U or Uconj depending on coor/flavour.
-	GaugeLinkField link(mat._grid);
-	// use lorentz for flavour as hack.
-	auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
-	PARALLEL_FOR_LOOP
-	  for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
-	    link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
-	  }
-	PokeIndex<LorentzIndex>(mat, link, mu);
-	return;
-      }
+}}

-      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
-				FermionField &Atilde, int mu) {
-	int Ls = Btilde._grid->_fdimensions[0];
-	
-	GaugeLinkField tmp(mat._grid);
-	tmp = zero;
-	PARALLEL_FOR_LOOP
-	  for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
-	    for (int s = 0; s < Ls; s++) {
-	      int sF = s + Ls * ss;
-	      auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
-	      tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
-	    }
-	  }
-	PokeIndex<LorentzIndex>(mat, tmp, mu);
-	return;
-      }
-    };
-
-    typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec
-    typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
-    typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double
-
-
-    typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
-    typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
-    typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
-
-    typedef WilsonImpl<vComplex,  AdjointRepresentation > WilsonAdjImplR;   // Real.. whichever prec
-    typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF;  // Float
-    typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD;  // Double
-
-    typedef WilsonImpl<vComplex,  TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR;   // Real.. whichever prec
-    typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF;  // Float
-    typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD;  // Double
-
-    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
-    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
-    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
-    
-    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
-    typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
-    typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
-
-    typedef GparityWilsonImpl<vComplex, Nc>  GparityWilsonImplR;  // Real.. whichever prec
-    typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float
-    typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double
-}
-}
 #endif
@@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
    ////////////////////////
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < B._grid->oSites(); sss++) {
-      Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu,
+      Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
                               gamma);
    }

@@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,

  PARALLEL_FOR_LOOP
  for (int sss = 0; sss < in._grid->oSites(); sss++) {
-    Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out,
+    Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
                             dirdisp, gamma);
  }
 };
@@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
  if (dag == DaggerYes) {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
+      Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                   out);
    }
  } else {
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < in._grid->oSites(); sss++) {
-      Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
+      Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
                                out);
    }
  }
@@ -184,44 +184,37 @@ void WilsonFermion5D<Impl>::Report(void)

  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls  << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
-              << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : "
-              << DhopCommTime / DhopCalls << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : "
-              << DhopComputeTime << " us" << std::endl;
-    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : "
-              << DhopComputeTime / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " << DhopCommTime / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " << DhopComputeTime << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl;

    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;

   }

  if ( DerivCalls > 0 ) {
-  std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " <<DerivComputeTime <<" us"<<std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl;
-  std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
-
-
-
-  RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " <<DerivComputeTime <<" us"<<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
    
+    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }

  if (DerivCalls > 0 || DhopCalls > 0){
-  std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
-  std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-  std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
  }
 }

@@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP
    for(int s=0;s<Ls;s++){
      int sU=ss;
      int sF = s+Ls*sU; 
-      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
+      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
    }
  }
 };
@@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
        assert(sF < B._grid->oSites());
        assert(sU < U._grid->oSites());

-        Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu,
-                                 gamma);
+        Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);

        ////////////////////////////
        // spin trace outer product
@@ -342,10 +334,10 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 }

 template<class Impl>
-void WilsonFermion5D<Impl>::DhopDeriv(      GaugeField &mat,
-              const FermionField &A,
-              const FermionField &B,
-              int dag)
+void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
+				      const FermionField &A,
+				      const FermionField &B,
+				      int dag)
 {
  conformable(A._grid,FermionGrid());  
  conformable(A._grid,B._grid);
@@ -358,9 +350,9 @@ void WilsonFermion5D<Impl>::DhopDeriv(      GaugeField &mat,

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
-          const FermionField &A,
-          const FermionField &B,
-          int dag)
+					const FermionField &A,
+					const FermionField &B,
+					int dag)
 {
  conformable(A._grid,FermionRedBlackGrid());
  conformable(GaugeRedBlackGrid(),mat._grid);
@@ -376,9 +368,9 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
-          const FermionField &A,
-          const FermionField &B,
-          int dag)
+					const FermionField &A,
+					const FermionField &B,
+					int dag)
 {
  conformable(A._grid,FermionRedBlackGrid());
  conformable(GaugeRedBlackGrid(),mat._grid);
@@ -393,8 +385,8 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-           DoubledGaugeField & U,
-           const FermionField &in, FermionField &out,int dag)
+					 DoubledGaugeField & U,
+					 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  Compressor compressor(dag);
@@ -412,27 +404,25 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
-      Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
-                                   out);
+      Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
    }
 #ifdef AVX512
  } else if (stat.is_init() ) {

    int nthreads;
    stat.start();
-    #pragma omp parallel
+#pragma omp parallel
    {
-    #pragma omp master
+#pragma omp master
    nthreads = omp_get_num_threads();
    int mythread = omp_get_thread_num();
    stat.enter(mythread);
-    #pragma omp for nowait
-   for(int ss=0;ss<U._grid->oSites();ss++)
-    {
-       int sU=ss;
-       int sF=LLs*sU;
-       Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
-     }
+#pragma omp for nowait
+    for(int ss=0;ss<U._grid->oSites();ss++) {
+      int sU=ss;
+      int sF=LLs*sU;
+      Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
+    }
    stat.exit(mythread);
    }
    stat.accum(nthreads);
@@ -442,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    for (int ss = 0; ss < U._grid->oSites(); ss++) {
      int sU = ss;
      int sF = LLs * sU;
-      Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
-                                out);
+      Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
    }
  }
  DhopComputeTime+=usecond();
@@ -34,155 +34,154 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Stat.h>

 namespace Grid {
+namespace QCD {

-  namespace QCD {
+  ////////////////////////////////////////////////////////////////////////////////
+  // This is the 4d red black case appropriate to support
+  //
+  // parity = (x+y+z+t)%2;
+  // generalised five dim fermions like mobius, zolotarev etc.
+  //
+  // i.e. even-even contains the fifth dim hopping term.
+  //
+  // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)%2 ]
+  ////////////////////////////////////////////////////////////////////////////////

-    ////////////////////////////////////////////////////////////////////////////////
-    // This is the 4d red black case appropriate to support
-    //
-    // parity = (x+y+z+t)|2;
-    // generalised five dim fermions like mobius, zolotarev etc..	
-    //
-    // i.e. even even contains fifth dim hopping term.
-    //
-    // [DIFFERS from original CPS red black implementation parity = (x+y+z+t+s)|2 ]
-    ////////////////////////////////////////////////////////////////////////////////
+  class WilsonFermion5DStatic { 
+  public:
+    // S-direction is INNERMOST and takes no part in the parity.
+    static const std::vector<int> directions;
+    static const std::vector<int> displacements;
+    const int npoint = 8;
+  };
  
-    class WilsonFermion5DStatic { 
-    public:
-      // S-direction is INNERMOST and takes no part in the parity.
-      static const std::vector<int> directions;
-      static const std::vector<int> displacements;
-      const int npoint = 8;
-    };
+  template<class Impl>
+  class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
+  {
+  public:
+    INHERIT_IMPL_TYPES(Impl);
+    typedef WilsonKernels<Impl> Kernels;
+    PmuStat stat;
    
-    template<class Impl>
-    class WilsonFermion5D : public WilsonKernels<Impl>, public WilsonFermion5DStatic
-    {
-    public:
-     INHERIT_IMPL_TYPES(Impl);
-     typedef WilsonKernels<Impl> Kernels;
-     PmuStat stat;
+    void Report(void);
+    void ZeroCounters(void);
+    double DhopCalls;
+    double DhopCommTime;
+    double DhopComputeTime;
    
-     void Report(void);
-     void ZeroCounters(void);
-     double DhopCalls;
-     double DhopCommTime;
-     double DhopComputeTime;
+    double DerivCalls;
+    double DerivCommTime;
+    double DerivComputeTime;
+    double DerivDhopComputeTime;
    
-     double DerivCalls;
-     double DerivCommTime;
-     double DerivComputeTime;
-     double DerivDhopComputeTime;
+    ///////////////////////////////////////////////////////////////
+    // Implement the abstract base
+    ///////////////////////////////////////////////////////////////
+    GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
+    GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
+    GridBase *FermionGrid(void)            { return _FiveDimGrid;}
+    GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
    
-      ///////////////////////////////////////////////////////////////
-      // Implement the abstract base
-      ///////////////////////////////////////////////////////////////
-      GridBase *GaugeGrid(void)              { return _FourDimGrid ;}
-      GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;}
-      GridBase *FermionGrid(void)            { return _FiveDimGrid;}
-      GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;}
+    // full checkerboard operations; leave unimplemented as abstract for now
+    virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+    virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
    
-      // full checkerboard operations; leave unimplemented as abstract for now
-      virtual RealD  M    (const FermionField &in, FermionField &out){assert(0); return 0.0;};
-      virtual RealD  Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
+    // half checkerboard operations; leave unimplemented as abstract for now
+    virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
+    virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
+    virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
    
-      // half checkerboard operations; leave unimplemented as abstract for now
-      virtual void   Meooe       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mooee       (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInv    (const FermionField &in, FermionField &out){assert(0);};
+    virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
+    virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
+    virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+    virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
    
-      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
-      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+    // These can be overridden by fancy 5d chiral action
+    virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+    virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+    virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
    
-      // These can be overridden by fancy 5d chiral action
-      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-      virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+    // Implement the non-hermitian hopping term; half cb or both
+    // Implement s-diagonal DW
+    void DW    (const FermionField &in, FermionField &out,int dag);
+    void Dhop  (const FermionField &in, FermionField &out,int dag);
+    void DhopOE(const FermionField &in, FermionField &out,int dag);
+    void DhopEO(const FermionField &in, FermionField &out,int dag);
    
-      // Implement hopping term non-hermitian hopping term; half cb or both
-      // Implement s-diagonal DW
-      void DW    (const FermionField &in, FermionField &out,int dag);
-      void Dhop  (const FermionField &in, FermionField &out,int dag);
-      void DhopOE(const FermionField &in, FermionField &out,int dag);
-      void DhopEO(const FermionField &in, FermionField &out,int dag);
-
-      // add a DhopComm
+    // add a DhopComm
      // -- suboptimal interface will presently trigger multiple comms.
-      void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+    void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
    
-      ///////////////////////////////////////////////////////////////
-      // New methods added 
-      ///////////////////////////////////////////////////////////////
-      void DerivInternal(StencilImpl & st,
-			 DoubledGaugeField & U,
-			 GaugeField &mat,
-			 const FermionField &A,
-			 const FermionField &B,
-			 int dag);
+    ///////////////////////////////////////////////////////////////
+    // New methods added 
+    ///////////////////////////////////////////////////////////////
+    void DerivInternal(StencilImpl & st,
+		       DoubledGaugeField & U,
+		       GaugeField &mat,
+		       const FermionField &A,
+		       const FermionField &B,
+		       int dag);
    
-      void DhopInternal(StencilImpl & st,
-			LebesgueOrder &lo,
-			DoubledGaugeField &U,
-			const FermionField &in, 
-			FermionField &out,
-			int dag);
+    void DhopInternal(StencilImpl & st,
+		      LebesgueOrder &lo,
+		      DoubledGaugeField &U,
+		      const FermionField &in, 
+		      FermionField &out,
+		      int dag);
    
-      // Constructors
-      WilsonFermion5D(GaugeField &_Umu,
-		      GridCartesian         &FiveDimGrid,
-		      GridRedBlackCartesian &FiveDimRedBlackGrid,
-		      GridCartesian         &FourDimGrid,
-		      GridRedBlackCartesian &FourDimRedBlackGrid,
-		      double _M5,const ImplParams &p= ImplParams());
+    // Constructors
+    WilsonFermion5D(GaugeField &_Umu,
+		    GridCartesian         &FiveDimGrid,
+		    GridRedBlackCartesian &FiveDimRedBlackGrid,
+		    GridCartesian         &FourDimGrid,
+		    GridRedBlackCartesian &FourDimRedBlackGrid,
+		    double _M5,const ImplParams &p= ImplParams());
    
-      // Constructors
-      /*
+    // Constructors
+    /*
      WilsonFermion5D(int simd, 
-		      GaugeField &_Umu,
-		      GridCartesian         &FiveDimGrid,
-		      GridRedBlackCartesian &FiveDimRedBlackGrid,
-		      GridCartesian         &FourDimGrid,
-		      double _M5,const ImplParams &p= ImplParams());
-      */
+      GaugeField &_Umu,
+      GridCartesian         &FiveDimGrid,
+      GridRedBlackCartesian &FiveDimRedBlackGrid,
+      GridCartesian         &FourDimGrid,
+      double _M5,const ImplParams &p= ImplParams());
+    */
    
-      // DoubleStore
-      void ImportGauge(const GaugeField &_Umu);
+    // DoubleStore
+    void ImportGauge(const GaugeField &_Umu);
    
-      ///////////////////////////////////////////////////////////////
-      // Data members require to support the functionality
-      ///////////////////////////////////////////////////////////////
-    public:
+    ///////////////////////////////////////////////////////////////
+    // Data members required to support the functionality
+    ///////////////////////////////////////////////////////////////
+  public:
    
-      // Add these to the support from Wilson
-      GridBase *_FourDimGrid;
-      GridBase *_FourDimRedBlackGrid;
-      GridBase *_FiveDimGrid;
-      GridBase *_FiveDimRedBlackGrid;
+    // Add these to the support from Wilson
+    GridBase *_FourDimGrid;
+    GridBase *_FourDimRedBlackGrid;
+    GridBase *_FiveDimGrid;
+    GridBase *_FiveDimRedBlackGrid;
    
-      double                        M5;
-      int Ls;
+    double                        M5;
+    int Ls;
    
-      //Defines the stencils for even and odd
-      StencilImpl Stencil; 
-      StencilImpl StencilEven; 
-      StencilImpl StencilOdd; 
+    //Defines the stencils for even and odd
+    StencilImpl Stencil; 
+    StencilImpl StencilEven; 
+    StencilImpl StencilOdd; 
    
-      // Copy of the gauge field , with even and odd subsets
-      DoubledGaugeField Umu;
-      DoubledGaugeField UmuEven;
-      DoubledGaugeField UmuOdd;
+    // Copy of the gauge field , with even and odd subsets
+    DoubledGaugeField Umu;
+    DoubledGaugeField UmuEven;
+    DoubledGaugeField UmuOdd;
    
-      LebesgueOrder Lebesgue;
-      LebesgueOrder LebesgueEvenOdd;
+    LebesgueOrder Lebesgue;
+    LebesgueOrder LebesgueEvenOdd;
    
-      // Comms buffer
-      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
+    // Comms buffer
+    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
-    };
-  }
-}
+  };
+
+}}

 #endif
@@ -43,10 +43,9 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 ////////////////////////////////////////////

 template <class Impl>
-void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    commVector<SiteHalfSpinor> &buf, int sF,
-    int sU, const FermionField &in, FermionField &out) {
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+						     SiteHalfSpinor *buf, int sF,
+						     int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -220,10 +219,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(

 // Need controls to do interior, exterior, or both
 template <class Impl>
-void WilsonKernels<Impl>::DiracOptGenericDhopSite(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    commVector<SiteHalfSpinor> &buf, int sF,
-    int sU, const FermionField &in, FermionField &out) {
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+						  SiteHalfSpinor *buf, int sF,
+						  int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
@@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
 };

 template <class Impl>
-void WilsonKernels<Impl>::DiracOptDhopDir(
-    StencilImpl &st, DoubledGaugeField &U,
-    commVector<SiteHalfSpinor> &buf, int sF,
-    int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
+void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
+					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
+
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteSpinor result;
@@ -32,175 +32,132 @@ directory
 #define GRID_QCD_DHOP_H

 namespace Grid {
+namespace QCD {

-  namespace QCD {
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Helper routines that implement Wilson stencil for a single site.
+  // Common to both the WilsonFermion and WilsonFermion5D
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class WilsonKernelsStatic { 
+ public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static int AsmOpt;  // these are a temporary hack
+  static int HandOpt; // these are a temporary hack
+};
 
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Helper routines that implement Wilson stencil for a single site.
-    // Common to both the WilsonFermion and WilsonFermion5D
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    class WilsonKernelsStatic { 
-    public:
-      // S-direction is INNERMOST and takes no part in the parity.
-      static int AsmOpt;  // these are a temporary hack
-      static int HandOpt; // these are a temporary hack
-    };
+template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
+ public:
   
-    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
-    public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
   
-      INHERIT_IMPL_TYPES(Impl);
-      typedef FermionOperator<Impl> Base;
+public:
   
-    public:
-
-      template <bool EnableBool = true>
-      typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
-	DiracOptDhopSite(
-			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			 commVector<SiteHalfSpinor> &buf,
-			 int sF, int sU, int Ls, int Ns, const FermionField &in,
-			 FermionField &out) {
+  template <bool EnableBool = true>
+  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
+  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 #ifdef AVX512
-	if (AsmOpt) {
-	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
-						   in, out);
-
-	} else {
+    if (AsmOpt) {
+      WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+    } else {
 #else
-	  {
+    {
 #endif
-	    for (int site = 0; site < Ns; site++) {
-	      for (int s = 0; s < Ls; s++) {
-		if (HandOpt)
-		  WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
-							    in, out);
-		else
-		  WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
-							       in, out);
-		sF++;
-	      }
-	      sU++;
-	    }
-	  }
+      for (int site = 0; site < Ns; site++) {
+	for (int s = 0; s < Ls; s++) {
+	  if (HandOpt)
+	    WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
+	  else
+	    WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
+	  sF++;
 	}
-
-	template <bool EnableBool = true>
-	  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
-	  DiracOptDhopSite(
-			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			   commVector<SiteHalfSpinor> &buf,
-			   int sF, int sU, int Ls, int Ns, const FermionField &in,
-			   FermionField &out) {
-	  for (int site = 0; site < Ns; site++) {
-	    for (int s = 0; s < Ls; s++) {
-	      WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
-							   out);
-	      sF++;
-	    }
-	    sU++;
-	  }
-	}
-
-	template <bool EnableBool = true>
-	  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
-				  void>::type
-	  DiracOptDhopSiteDag(
-			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			      commVector<SiteHalfSpinor> &buf,
-			      int sF, int sU, int Ls, int Ns, const FermionField &in,
-			      FermionField &out) {
-#ifdef AVX512
-				    if (AsmOpt) {
-				      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
-										  Ns, in, out);
-				    } else {
-#else
-				      {
-#endif
-					for (int site = 0; site < Ns; site++) {
-					  for (int s = 0; s < Ls; s++) {
-					    if (HandOpt)
-					      WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
-											   in, out);
-					    else
-					      WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
-											      sU, in, out);
-					    sF++;
-					  }
-					  sU++;
-					}
-				      }
-				    }
-
-				    template <bool EnableBool = true>
-				      typename std::enable_if<
-				      (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
-				      void>::type
-				      DiracOptDhopSiteDag(
-							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							  commVector<SiteHalfSpinor> &buf,
-							  int sF, int sU, int Ls, int Ns, const FermionField &in,
-							  FermionField &out) {
-					for (int site = 0; site < Ns; site++) {
-					  for (int s = 0; s < Ls; s++) {
-					    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
-											    in, out);
-					    sF++;
-					  }
-					  sU++;
-					}
-				      }
-
-				    void DiracOptDhopDir(
-							 StencilImpl &st, DoubledGaugeField &U,
-							 commVector<SiteHalfSpinor> &buf,
-							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
-							 int gamma);
-
-	private:
-				    // Specialised variants
-				    void DiracOptGenericDhopSite(
-								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 commVector<SiteHalfSpinor> &buf,
-								 int sF, int sU, const FermionField &in, FermionField &out);
-
-				    void DiracOptGenericDhopSiteDag(
-								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								    commVector<SiteHalfSpinor> &buf,
-								    int sF, int sU, const FermionField &in, FermionField &out);
-
-				    void DiracOptAsmDhopSite(
-							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							     commVector<SiteHalfSpinor> &buf,
-							     int sF, int sU, int Ls, int Ns, const FermionField &in,
-							     FermionField &out);
-
-				    void DiracOptAsmDhopSiteDag(
-								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								commVector<SiteHalfSpinor> &buf,
-								int sF, int sU, int Ls, int Ns, const FermionField &in,
-								FermionField &out);
-
-				    void DiracOptHandDhopSite(
-							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							      commVector<SiteHalfSpinor> &buf,
-							      int sF, int sU, const FermionField &in, FermionField &out);
-
-				    void DiracOptHandDhopSiteDag(
-								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 commVector<SiteHalfSpinor> &buf,
-								 int sF, int sU, const FermionField &in, FermionField &out);
-
-	public:
-				    WilsonKernels(const ImplParams &p = ImplParams());
-				  };
-    
+	sU++;
      }
    }
+  }
     
+  template <bool EnableBool = true>
+  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
+  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
     
+    for (int site = 0; site < Ns; site++) {
+      for (int s = 0; s < Ls; s++) {
+	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
+	sF++;
+      }
+      sU++;
+    }
+  }
     
+  template <bool EnableBool = true>
+  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
+  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+#ifdef AVX512
+    if (AsmOpt) {
+      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+    } else {
+#else
+    {
+#endif
+      for (int site = 0; site < Ns; site++) {
+	for (int s = 0; s < Ls; s++) {
+	  if (HandOpt)
+	    WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	  else
+	    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	  sF++;
+	}
+	sU++;
+      }
+    }
+  }

+  template <bool EnableBool = true>
+  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
+  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
+		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+
+    for (int site = 0; site < Ns; site++) {
+      for (int s = 0; s < Ls; s++) {
+	WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	sF++;
+      }
+      sU++;
+    }
+  }
+
+  void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
+		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
+      
+private:
+     // Specialised variants
+  void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			       int sF, int sU, const FermionField &in, FermionField &out);
+      
+  void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+				  int sF, int sU, const FermionField &in, FermionField &out);
+
+  void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
+
+  void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
+
+  void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			    int sF, int sU, const FermionField &in, FermionField &out);
+
+  void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
+			       int sF, int sU, const FermionField &in, FermionField &out);
+      
+public:
+
+  WilsonKernels(const ImplParams &p = ImplParams());
+
+};
+    
+}}

 #endif
@@ -33,31 +33,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 namespace Grid {
-  namespace QCD {
-    
-    ///////////////////////////////////////////////////////////
-    // Default to no assembler implementation
-    ///////////////////////////////////////////////////////////
-    template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                             commVector<SiteHalfSpinor>  &buf,
-                             int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-    {
-      assert(0);
-    }
-    template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                commVector<SiteHalfSpinor>  &buf,
-                                int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-    {
-      assert(0);
-    }
+namespace QCD {
    
+///////////////////////////////////////////////////////////
+// Default to no assembler implementation
+///////////////////////////////////////////////////////////
+template<class Impl> void 
+WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0);
+}

+template<class Impl> void 
+WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0);
+}

 #if defined(AVX512) 
    
-    
    ///////////////////////////////////////////////////////////
    // If we are AVX512 specialise the single precision routine
    ///////////////////////////////////////////////////////////
@@ -65,16 +61,16 @@ namespace Grid {
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
    
-    static Vector<vComplexF> signs;
+static Vector<vComplexF> signs;
    
-    int setupSigns(void ){
-      Vector<vComplexF> bother(2);
-      signs = bother;
-      vrsign(signs[0]);
-      visign(signs[1]);
-      return 1;
-    }
-    static int signInit = setupSigns();
+  int setupSigns(void ){
+    Vector<vComplexF> bother(2);
+    signs = bother;
+    vrsign(signs[0]);
+    visign(signs[1]);
+    return 1;
+  }
+  static int signInit = setupSigns();
  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
@@ -84,17 +80,15 @@ namespace Grid {
 #define FX(A) WILSONASM_ ##A
  
 #undef KERNEL_DAG
-    template<>
-    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							 commVector<SiteHalfSpinor>  &buf,
-							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
      
 #define KERNEL_DAG
-    template<>
-    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							    commVector<SiteHalfSpinor>  &buf,
-							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
 #undef VMOVIDUP
@@ -109,31 +103,26 @@ namespace Grid {
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG
-    template<>
-    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								  commVector<SiteHalfSpinor>  &buf,
-								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
 #define KERNEL_DAG
-    template<>
-    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								     commVector<SiteHalfSpinor>  &buf,
-								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
 #endif

-
 #define INSTANTIATE_ASM(A)\
-template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   commVector<SiteHalfSpinor>  &buf,\
+template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   commVector<SiteHalfSpinor>  &buf,\
+ \
+template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\

-
 INSTANTIATE_ASM(WilsonImplF);
 INSTANTIATE_ASM(WilsonImplD);
 INSTANTIATE_ASM(ZWilsonImplF);
@@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF);
 INSTANTIATE_ASM(DomainWallVec5dImplD);
 INSTANTIATE_ASM(ZDomainWallVec5dImplF);
 INSTANTIATE_ASM(ZDomainWallVec5dImplD);
-  }
-}
+
+}}

@@ -311,10 +311,9 @@ namespace Grid {
 namespace QCD {


-  template<class Impl>
-  void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       commVector<SiteHalfSpinor>  &buf,
-					       int ss,int sU,const FermionField &in, FermionField &out)
+template<class Impl> void 
+WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -554,10 +553,9 @@ namespace QCD {
  }
 }

-  template<class Impl>
-  void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       commVector<SiteHalfSpinor>  &buf,
-					       int ss,int sU,const FermionField &in, FermionField &out)
+template<class Impl>
+void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
 {
  //  std::cout << "Hand op Dhop "<<std::endl;
  typedef typename Simd::scalar_type S;
@@ -798,38 +796,35 @@ namespace QCD {
  }
 }

-
  ////////////////////////////////////////////////
  // Specialise Gparity to simple implementation
  ////////////////////////////////////////////////
-template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     commVector<SiteHalfSpinor>  &buf,
-							     int sF,int sU,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+							SiteHalfSpinor *buf,
+							int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-template<>
-void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								commVector<SiteHalfSpinor>  &buf,
-								int sF,int sU,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+							   SiteHalfSpinor *buf,
+							   int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     commVector<SiteHalfSpinor>  &buf,
-							     int sF,int sU,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }

-template<>
-void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								commVector<SiteHalfSpinor>  &buf,
-								int sF,int sU,const FermionField &in, FermionField &out)
+template<> void 
+WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							   int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
@@ -840,12 +835,10 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 // Need Nc=3 though //

 #define INSTANTIATE_THEM(A) \
-template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-							       commVector<SiteHalfSpinor>  &buf,\
-							       int ss,int sU,const FermionField &in, FermionField &out);\
-template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-								  commVector<SiteHalfSpinor>  &buf,\
-								  int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+						     int ss,int sU,const FermionField &in, FermionField &out); \
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
+							int ss,int sU,const FermionField &in, FermionField &out);

 INSTANTIATE_THEM(WilsonImplF);
 INSTANTIATE_THEM(WilsonImplD);
@@ -116,7 +116,7 @@ int main (int argc, char ** argv)
 	  else if (SE->_is_local)
 	    Check._odata[i] = Foo._odata[SE->_offset];
 	  else 
-	    Check._odata[i] = myStencil.comm_buf[SE->_offset];
+	    Check._odata[i] = myStencil.CommBuf()[SE->_offset];
 	}

 	Real nrmC = norm2(Check);
@@ -207,7 +207,7 @@ int main (int argc, char ** argv)
 	  else if (SE->_is_local)
 	    OCheck._odata[i] = EFoo._odata[SE->_offset];
 	  else 
-	    OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
+	    OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
 	}
 	for(int i=0;i<ECheck._grid->oSites();i++){
 	  int permute_type;
@@ -220,7 +220,7 @@ int main (int argc, char ** argv)
 	  else if (SE->_is_local)
 	    ECheck._odata[i] = OFoo._odata[SE->_offset];
 	  else 
-	    ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
+	    ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
 	}
 	
 	setCheckerboard(Check,ECheck);