diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 1d90cace..580f5e30 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -26,7 +26,7 @@ namespace Grid {
     // and Methods:
     //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
     //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
-    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
+    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
     //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
     //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
     //
@@ -101,6 +101,7 @@ namespace Grid {
     typedef typename Impl::SiteSpinor               SiteSpinor;		\
     typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
     typedef typename Impl::Compressor               Compressor;		\
+    typedef typename Impl::StencilImpl              StencilImpl;	\
     typedef typename Impl::ImplParams ImplParams;
 
     ///////
@@ -112,7 +113,6 @@ namespace Grid {
 
       typedef ImplGauge<S,Nrepresentation> Gimpl;
 
-
       INHERIT_GIMPL_TYPES(Gimpl);
 
       template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
@@ -128,10 +128,11 @@ namespace Grid {
 
       typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
       typedef WilsonImplParams ImplParams;
+      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
       ImplParams Params;
       WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 
-      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){ // forwards to mult() with phi, U indexed by mu, and chi; SE and St are not used in this body
         mult(&phi(),&U(mu),&chi());
       }
 
@@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
       typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 
       typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 
       typedef GparityWilsonImplParams ImplParams;
       ImplParams Params;
       GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
       
+
       // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
-      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
 
 	typedef SiteHalfSpinor vobj;
 	typedef typename SiteHalfSpinor::scalar_object sobj;
diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc
index 053526cd..5e37238c 100644
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -109,7 +109,7 @@ namespace QCD {
   ///////////////////////////////////
 
   template<class Impl>
-  void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
+  void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
 					  DoubledGaugeField & U,
 					  GaugeField &mat,
 					  const FermionField &A,
@@ -123,7 +123,7 @@ namespace QCD {
     FermionField Atilde(B._grid);
     Atilde = A;
 
-    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
+    st.HaloExchange(B,comm_buf,compressor);
     
     for(int mu=0;mu<Nd;mu++){
       
@@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
     
     Compressor compressor(dag);
     
-    Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
+    Stencil.HaloExchange(in,comm_buf,compressor);
     
 PARALLEL_FOR_LOOP
       for(int sss=0;sss<in._grid->oSites();sss++){
@@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
 
 
   template<class Impl>
-  void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
+  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) {
 
     assert((dag==DaggerNo) ||(dag==DaggerYes));
 
     Compressor compressor(dag);
-    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
+    st.HaloExchange(in,comm_buf,compressor);
     
     if ( dag == DaggerYes ) {
       if( HandOptDslash ) {
diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h
index bab8ff15..9a8c848a 100644
--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -73,14 +73,14 @@ namespace Grid {
       ///////////////////////////////////////////////////////////////
       // Extra methods added by derived
       ///////////////////////////////////////////////////////////////
-      void DerivInternal(CartesianStencil & st,
+      void DerivInternal(StencilImpl & st,
 			 DoubledGaugeField & U,
 			 GaugeField &mat,
 			 const FermionField &A,
 			 const FermionField &B,
 			 int dag);
 
-      void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
+      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 			const FermionField &in, FermionField &out,int dag) ;
 
 
@@ -108,9 +108,9 @@ namespace Grid {
       GridBase                     *  _cbgrid;
 
       //Defines the stencils for even and odd
-      CartesianStencil Stencil; 
-      CartesianStencil StencilEven; 
-      CartesianStencil StencilOdd; 
+      StencilImpl Stencil; 
+      StencilImpl StencilEven; 
+      StencilImpl StencilOdd; 
 
       // Copy of the gauge field , with even and odd subsets
       DoubledGaugeField Umu;
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 66ca67d5..8f22bc32 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -68,6 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
 
   ImportGauge(_Umu);
+  commtime=0;
+  dslashtime=0;
 }  
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@@ -85,7 +87,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
   //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 
   Compressor compressor(DaggerNo);
-  Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
+  Stencil.HaloExchange(in,comm_buf,compressor);
   
   int skip = (disp==1) ? 0 : 1;
 
@@ -105,7 +107,7 @@ PARALLEL_FOR_LOOP
 };
 
 template<class Impl>
-void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
+void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 					  DoubledGaugeField & U,
 					  GaugeField &mat,
 					  const FermionField &A,
@@ -122,7 +124,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
   FermionField Btilde(B._grid);
   FermionField Atilde(B._grid);
 
-  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
+  st.HaloExchange(B,comm_buf,compressor);
 
   Atilde=A;
 
@@ -194,6 +196,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
   DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
 }
 
+
+template<class Impl>
+void WilsonFermion5D<Impl>::Report(void) // Logs the accumulated commtime/dslashtime members and the Stencil timing counters via GridLogMessage; values are printed with a " us" suffix.
+{
+  std::cout<<GridLogMessage << "********************"<<std::endl;
+  std::cout<<GridLogMessage << "Halo   time "<<commtime <<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil All    time "<<Stencil.halotime<<" us"<<std::endl; // NOTE(review): only Stencil's counters are printed; StencilEven/StencilOdd are not reported here -- confirm this is intended
+  std::cout<<GridLogMessage << "********************"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil comm   time "<<Stencil.commtime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "********************"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil comms  time "<<Stencil.commstime<<" us"<<std::endl; // distinct label: commstime is a separate counter from commtime printed above
+  std::cout<<GridLogMessage << "Stencil gathermtime "<<Stencil.gathermtime<<" us"<<std::endl; // label spelled to match the gathermtime member
+  std::cout<<GridLogMessage << "Stencil merge  time "<<Stencil.mergetime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "Stencil buf    time "<<Stencil.buftime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "********************"<<std::endl;
+}
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 				  const FermionField &A,
@@ -212,7 +235,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 }
 
 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
+void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag)
 {
@@ -220,13 +243,16 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
 
   Compressor compressor(dag);
 
-  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
+  commtime -=usecond();
+  st.HaloExchange(in,comm_buf,compressor);
+  commtime +=usecond();
   
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
   // Not loop ordering and data layout.
   // Designed to create 
   // - per thread reuse in L1 cache for U
   // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+  dslashtime -=usecond();
   if ( dag == DaggerYes ) {
     if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
@@ -274,6 +300,7 @@ PARALLEL_FOR_LOOP
       }
     }
   }
+  dslashtime +=usecond();
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h
index b0d25309..e95fc2b3 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -31,7 +31,8 @@ namespace Grid {
     public:
      INHERIT_IMPL_TYPES(Impl);
      typedef WilsonKernels<Impl> Kernels;
-
+     double commtime;
+     double dslashtime;
       ///////////////////////////////////////////////////////////////
       // Implement the abstract base
       ///////////////////////////////////////////////////////////////
@@ -72,14 +73,14 @@ namespace Grid {
       ///////////////////////////////////////////////////////////////
       // New methods added 
       ///////////////////////////////////////////////////////////////
-      void DerivInternal(CartesianStencil & st,
+      void DerivInternal(StencilImpl & st,
 			 DoubledGaugeField & U,
 			 GaugeField &mat,
 			 const FermionField &A,
 			 const FermionField &B,
 			 int dag);
 
-      void DhopInternal(CartesianStencil & st,
+      void DhopInternal(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
 			const FermionField &in, 
@@ -97,6 +98,7 @@ namespace Grid {
       // DoubleStore
       void ImportGauge(const GaugeField &_Umu);
 
+      void Report(void);
       ///////////////////////////////////////////////////////////////
       // Data members require to support the functionality
       ///////////////////////////////////////////////////////////////
@@ -112,9 +114,9 @@ namespace Grid {
       int Ls;
 
       //Defines the stencils for even and odd
-      CartesianStencil Stencil; 
-      CartesianStencil StencilEven; 
-      CartesianStencil StencilOdd; 
+      StencilImpl Stencil; 
+      StencilImpl StencilEven; 
+      StencilImpl StencilOdd; 
 
       // Copy of the gauge field , with even and odd subsets
       DoubledGaugeField Umu;
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index a897921a..a37e7f9e 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -3,7 +3,7 @@ namespace Grid {
 namespace QCD {
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						  int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
 };
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					      int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
 }
 
 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 					  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					  int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
 {
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index 78d7c6a0..9696fad5 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -17,36 +17,36 @@ namespace Grid {
      typedef FermionOperator<Impl> Base;
      
     public:
-     void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			   int sF,int sU,const FermionField &in, FermionField &out);
       
-     void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in,FermionField &out);
 
-     void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 #define HANDOPT
 #ifdef HANDOPT
-     void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			       int sF,int sU,const FermionField &in, FermionField &out);
 
-     void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				  int sF,int sU,const FermionField &in, FermionField &out);
 #else
 
-     void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, // #else fallback: forwards to DiracOptDhopSite; not compiled while HANDOPT is #defined just above
 			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			       int sF,int sU,const FermionField &in, FermionField &out)
      {
        DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
      }
 
-     void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				  int sF,int sU,const FermionField &in, FermionField &out)
      {
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 9ccd18c6..fedfdd00 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -73,7 +73,7 @@ namespace Grid {
 namespace QCD {
 
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
 {
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index fee1b0fb..b263ea83 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -282,7 +282,7 @@ namespace QCD {
 
 #ifdef HANDOPT
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
 }
 
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						   int ss,int sU,const FermionField &in, FermionField &out)
 {