diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h new file mode 100644 index 00000000..22b7725e --- /dev/null +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -0,0 +1,241 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_PREC_GCR_NON_HERM_H +#define GRID_PREC_GCR_NON_HERM_H + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +//VPGCR Abe and Zhang, 2005. +//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING +//Computing and Information Volume 2, Number 2, Pages 147-161 +//NB. Likely not original reference since they are focussing on a preconditioner variant. +// but VPGCR was nicely written up in their paper +/////////////////////////////////////////////////////////////////////////////////////////////////////// +NAMESPACE_BEGIN(Grid); + +#define GCRLogLevel std::cout << GridLogMessage < +class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction { +public: + + RealD Tolerance; + Integer MaxIterations; + int verbose; + int mmax; + int nstep; + int steps; + int level; + GridStopWatch PrecTimer; + GridStopWatch MatTimer; + GridStopWatch LinalgTimer; + + LinearFunction &Preconditioner; + LinearOperatorBase &Linop; + + void Level(int lv) { level=lv; }; + + PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase &_Linop,LinearFunction &Prec,int _mmax,int _nstep) : + Tolerance(tol), + MaxIterations(maxit), + Linop(_Linop), + Preconditioner(Prec), + mmax(_mmax), + nstep(_nstep) + { + level=1; + verbose=1; + }; + + void operator() (const Field &src, Field &psi){ + + psi=Zero(); + RealD cp, ssq,rsq; + ssq=norm2(src); + rsq=Tolerance*Tolerance*ssq; + + Field r(src.Grid()); + + PrecTimer.Reset(); + MatTimer.Reset(); + LinalgTimer.Reset(); + + GridStopWatch SolverTimer; + SolverTimer.Start(); + + steps=0; + for(int k=0;k q(mmax,grid); + std::vector p(mmax,grid); + std::vector qq(mmax); + + GCRLogLevel<< "PGCR nStep("<(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. 
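// (annotation, not part of the patch) The loop below A-orthogonalises the new
// search direction against up to mmax stored previous ones, using q[i] = A p[i]:
//   b = -Re<q[back], A z> / <q[back], q[back]>,  p[kp] += b p[back],  q[kp] += b q[back].
// With the truncated history this is GCR(mmax), restarted every nstep steps,
// as in the Abe & Zhang VPGCR write-up cited in the file header.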
+ for(int back=0;back=0); + + b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; + p[peri_kp]=p[peri_kp]+b*p[peri_back]; + q[peri_kp]=q[peri_kp]+b*q[peri_back]; + + } + qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm + LinalgTimer.Stop(); + } + assert(0); // never reached + return cp; + } +}; +NAMESPACE_END(Grid); +#endif diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 6c6dd7d8..ebb3162b 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -52,41 +52,79 @@ public: pointer allocate(size_type __n, const void* _p= 0) { size_type bytes = __n*sizeof(_Tp); - profilerAllocate(bytes); - _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); - assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); - return ptr; } void deallocate(pointer __p, size_type __n) { size_type bytes = __n * sizeof(_Tp); - profilerFree(bytes); - MemoryManager::CpuFree((void *)__p,bytes); } + // FIXME: hack for the copy constructor, eventually it must be avoided + //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; + void construct(pointer __p, const _Tp& __val) { assert(0);}; + void construct(pointer __p) { }; + void destroy(pointer __p) { }; +}; +template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } +template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } + +template +class uvmAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef uvmAllocator<_Tp1> other; }; + uvmAllocator() throw() { } + uvmAllocator(const uvmAllocator&) throw() { } + template uvmAllocator(const uvmAllocator<_Tp1>&) throw() { } + ~uvmAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::SharedFree((void *)__p,bytes); + } + // FIXME: hack for the copy constructor, eventually it must be avoided void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; //void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; -template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } -template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } +template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = alignedAllocator; -template using Vector = std::vector >; -template using commVector = std::vector >; -template using Matrix = std::vector > >; +template 
using commAllocator = uvmAllocator; +template using Vector = std::vector >; +template using commVector = std::vector >; +//template using Matrix = std::vector > >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index 6d638b60..e11ce948 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -7,13 +7,24 @@ NAMESPACE_BEGIN(Grid); #define CpuSmall (1) #define Acc (2) #define AccSmall (3) +#define Shared (4) +#define SharedSmall (5) +uint64_t total_shared; +uint64_t total_device; +uint64_t total_host;; +void MemoryManager::PrintBytes(void) +{ + std::cout << " MemoryManager : "<=0) && (Nc < NallocCacheMax)) { Ncache[Cpu]=Nc; Ncache[Acc]=Nc; + Ncache[Shared]=Nc; } } @@ -84,6 +133,7 @@ void MemoryManager::Init(void) if ( (Nc>=0) && (Nc < NallocCacheMax)) { Ncache[CpuSmall]=Nc; Ncache[AccSmall]=Nc; + Ncache[SharedSmall]=Nc; } } std::cout << GridLogMessage<< "MemoryManager::Init() setting up"< @@ -93,7 +92,9 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) ssum = ssum+sumarray[i]; } - return ssum; + typedef typename vobj::scalar_object ssobj; + ssobj ret = ssum; + return ret; } @@ -154,7 +155,7 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & const uint64_t sites = grid->oSites(); // Might make all code paths go this way. - typedef decltype(innerProduct(vobj(),vobj())) inner_t; + typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; @@ -163,16 +164,16 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & autoView( right_v,right, AcceleratorRead); // GPU - SIMT lane compliance... - accelerator_for( ss, sites, nsimd,{ - auto x_l = left_v(ss); - auto y_l = right_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); - }) + accelerator_for( ss, sites, 1,{ + auto x_l = left_v[ss]; + auto y_l = right_v[ss]; + inner_tmp_v[ss]=innerProductD(x_l,y_l); + }); } // This is in single precision and fails some tests - // Need a sumD that sums in double - nrm = TensorRemove(sumD(inner_tmp_v,sites)); + auto anrm = sum(inner_tmp_v,sites); + nrm = anrm; return nrm; } @@ -218,16 +219,16 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt autoView( y_v, y, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); - typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; + typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); + accelerator_for( ss, sites, 1,{ + auto tmp = a*x_v[ss]+b*y_v[ss]; + inner_tmp_v[ss]=innerProductD(tmp,tmp); + z_v[ss]=tmp; }); - nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); + nrm = real(TensorRemove(sum(inner_tmp_v,sites))); grid->GlobalSum(nrm); return nrm; } @@ -243,29 +244,28 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti GridBase *grid = left.Grid(); - const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); // GPU - typedef decltype(innerProduct(vobj(),vobj())) inner_t; - typedef decltype(innerProduct(vobj(),vobj())) norm_t; + typedef decltype(innerProductD(vobj(),vobj())) inner_t; + typedef decltype(innerProductD(vobj(),vobj())) norm_t; Vector inner_tmp(sites); - Vector norm_tmp(sites); + Vector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; { 
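// (annotation, not part of the patch) The extra braces scope the lattice views
// so they are released as soon as the site loop finishes, before the host-side
// sum over inner_tmp_v/norm_tmp_v and the GlobalSumVector call below.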
autoView(left_v,left, AcceleratorRead); autoView(right_v,right,AcceleratorRead); - accelerator_for( ss, sites, nsimd,{ - auto left_tmp = left_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); - coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); + accelerator_for( ss, sites, 1,{ + auto left_tmp = left_v[ss]; + inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]); + norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp); }); } - tmp[0] = TensorRemove(sumD(inner_tmp_v,sites)); - tmp[1] = TensorRemove(sumD(norm_tmp_v,sites)); + tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); + tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector ip = tmp[0]; diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index febb315e..14675b11 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -1,5 +1,13 @@ #pragma once +#define CPS_MD_TIME + +#ifdef CPS_MD_TIME +#define HMC_MOMENTUM_DENOMINATOR (2.0) +#else +#define HMC_MOMENTUM_DENOMINATOR (1.0) +#endif + NAMESPACE_BEGIN(Grid); template @@ -20,7 +28,9 @@ public: typedef Field PropagatorField; static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling gaussian(pRNG, P); + P *= scale; } static inline Field projectForce(Field& P){return P;} @@ -66,7 +76,7 @@ public: } static void FreePropagator(const Field &in, Field &out, - const Field &momKernel) + const Field &momKernel) { FFT fft((GridCartesian *)in.Grid()); Field inFT(in.Grid()); @@ -139,14 +149,17 @@ public: static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) { + RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling #ifndef USE_FFT_ACCELERATION Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P); + #else Field Pgaussian(P.Grid()), Pp(P.Grid()); ComplexField p2(P.Grid()); p2 = zero; RealD M = FFT_MASS; - + + Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian); FFT theFFT((GridCartesian*)P.Grid()); @@ -156,17 +169,17 @@ public: p2 = sqrt(p2); Pp *= p2; theFFT.FFT_all_dim(P, Pp, FFT::backward); - #endif //USE_FFT_ACCELERATION + P *= scale; } - static inline Field projectForce(Field& P) {return P;} + static inline Field projectForce(Field& P) {return Ta(P);} static inline void update_field(Field &P, Field &U, double ep) { #ifndef USE_FFT_ACCELERATION double t0=usecond(); - U += P*ep; + U += P*ep; double t1=usecond(); double total_time = (t1-t0)/1e6; std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl; diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index cfb19d69..b268b684 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -7,6 +7,7 @@ Copyright (C) 2019 Author: Felix Erben + Author: Raoul Hodgson This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -58,9 +59,12 @@ public: const Gamma GammaA_right, const Gamma GammaB_right, const int parity, - const int * wick_contractions, + const bool * wick_contractions, robj &result); public: + static void Wick_Contractions(std::string qi, + std::string qf, + bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, @@ -68,8 +72,7 @@ public: const Gamma GammaB_left, const Gamma 
GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, ComplexField &baryon_corr); template @@ -80,10 +83,59 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, + const int nt, robj &result); + private: + template + static void Baryon_Gamma_3pt_Group1_Site( + const mobj &Dq1_ti, + const mobj2 &Dq2_spec, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + + template + static void Baryon_Gamma_3pt_Group2_Site( + const mobj2 &Dq1_spec, + const mobj &Dq2_ti, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + + template + static void Baryon_Gamma_3pt_Group3_Site( + const mobj2 &Dq1_spec, + const mobj2 &Dq2_spec, + const mobj &Dq3_ti, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result); + public: + template + static void Baryon_Gamma_3pt( + const PropagatorField &q_ti, + const mobj &Dq_spec1, + const mobj &Dq_spec2, + const PropagatorField &q_tf, + int group, + int wick_contraction, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + SpinMatrixField &stn_corr); private: template static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, @@ -166,111 +218,137 @@ const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; template template void BaryonUtils::baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const int parity, + const bool * wick_contraction, + robj &result) { Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (Real)parity * gD1b); - auto gD3 = GammaB_right * D3; - auto D2g = D2 * GammaB_left; - auto pD1g = pD1 * GammaB_left; - auto gD3g = gD3 * GammaB_left; + + auto D1_GAi = D1 * GammaA_i; + auto D1_GAi_g4 = D1_GAi * g4; + auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); + auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; + auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P; - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - Real ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = 
epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - for (int gamma_left=0; gamma_left +void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { + const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; + for (int ie=0; ie < 6 ; ie++) { + wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 + && qi[0] == qf[epsilon[ie][0]] + && qi[1] == qf[epsilon[ie][1]] + && qi[2] == qf[epsilon[ie][2]]); } } +/* The array wick_contractions must be of length 6. The order * + * corresponds to the to that shown in the Hadrons documentation * + * at https://aportelli.github.io/Hadrons-doc/#/mcontraction * + * This can be computed from the quark flavours using the * + * Wick_Contractions function above */ template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -279,8 +357,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, ComplexField &baryon_corr) { @@ -288,7 +365,6 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; @@ -297,11 +373,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); GridBase *grid = q1_left.Grid(); - - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - + autoView(vbaryon_corr, baryon_corr,CpuWrite); autoView( v1 , q1_left, CpuRead); autoView( v2 , q2_left, CpuRead); @@ -311,10 +383,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); for (int ie=0; ie < 6 ; ie++){ if(ie==0 or ie==3){ - bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contraction[ie]; + bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; } else{ - bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contraction[ie]; + bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; } } Real t=0.; @@ -325,7 +397,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites @@ -334,6 +406,12 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } + +/* The array wick_contractions must be of length 6. The order * + * corresponds to the to that shown in the Hadrons documentation * + * at https://aportelli.github.io/Hadrons-doc/#/mcontraction * + * This can also be computed from the quark flavours using the * + * Wick_Contractions function above */ template template void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, @@ -343,16 +421,15 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const char * quarks_left, - const char * quarks_right, + const bool* wick_contractions, const int parity, + const int nt, robj &result) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; @@ -360,17 +437,347 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - - result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + for (int t=0; t +template +void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( + const mobj &Dq1_ti, + const mobj2 &Dq2_spec, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; + auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + auto Gf_D3 = GammaBf * Dq3_spec; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( + const mobj2 &Dq1_spec, + const mobj &Dq2_ti, + const mobj2 &Dq3_spec, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; + auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; + auto Gf_D1 = GammaBf * Dq1_spec; + auto Gf_D3 = GammaBf * Dq3_spec; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( + const mobj2 &Dq1_spec, + const mobj2 &Dq2_spec, + const mobj &Dq3_ti, + const mobj &Dq4_tf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + int wick_contraction, + robj &result) +{ + Gamma g5(Gamma::Algebra::Gamma5); + + auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; + auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; + auto Gf_D1 = GammaBf * Dq1_spec; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + + int a_f, b_f, c_f; + int a_i, b_i, c_i; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + a_f = epsilon[ie_f][0]; //a + b_f = epsilon[ie_f][1]; //b + c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + a_i = epsilon[ie_i][0]; //a' + b_i = epsilon[ie_i][1]; //b' + c_i = epsilon[ie_i][2]; //c' + + ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + + for (int alpha_f=0; alpha_f +template +void BaryonUtils::Baryon_Gamma_3pt( + const PropagatorField &q_ti, + const mobj &Dq_spec1, + const mobj &Dq_spec2, + const PropagatorField &q_tf, + int group, + int wick_contraction, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, + SpinMatrixField &stn_corr) +{ + GridBase *grid = q_tf.Grid(); + + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_ti , q_ti, CpuRead); + autoView( vq_tf , q_tf, CpuRead); + + if (group == 1) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + 
Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + vcorr[ss] += result; + });//end loop over lattice sites + } else if (group == 2) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + vcorr[ss] += result; + });//end loop over lattice sites + } else if (group == 3) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti[ss]; + auto Dq_tf = vq_tf[ss]; + sobj result=Zero(); + Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + + vcorr[ss] += result; + });//end loop over lattice sites + } +} + + +/*********************************************************************** + * End of BaryonGamma3pt-function code. * + * * * The following code is for Sigma -> N rare hypeon decays * **********************************************************************/ diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index dbcbae8d..36becc49 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -59,6 +59,20 @@ class GridTensorBase {}; using DoublePrecision2= typename Traits::DoublePrecision2; \ static constexpr int TensorLevel = Traits::TensorLevel +/////////////////////////////////////////////////////////// +// Allows to turn scalar>>> back to double. +/////////////////////////////////////////////////////////// +template +accelerator_inline typename std::enable_if::value, T>::type +TensorRemove(T arg) { + return arg; +} +template +accelerator_inline auto TensorRemove(iScalar arg) + -> decltype(TensorRemove(arg._internal)) { + return TensorRemove(arg._internal); +} + template class iScalar { public: @@ -135,9 +149,10 @@ public: operator ComplexD() const { return (TensorRemove(_internal)); } + // instantiation of "Grid::iScalar::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, =0, =0U]" template = 0,IfNotSimd = 0> accelerator_inline operator RealD() const { - return TensorRemove(_internal); + return (RealD) TensorRemove(_internal); } template = 0, IfNotSimd = 0> accelerator_inline operator Integer() const { @@ -169,20 +184,6 @@ public: strong_inline scalar_type * end() { return begin() + Traits::count; } }; -/////////////////////////////////////////////////////////// -// Allows to turn scalar>>> back to double. 
-/////////////////////////////////////////////////////////// -template -accelerator_inline typename std::enable_if::value, T>::type -TensorRemove(T arg) { - return arg; -} -template -accelerator_inline auto TensorRemove(iScalar arg) - -> decltype(TensorRemove(arg._internal)) { - return TensorRemove(arg._internal); -} - template class iVector { public: diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1cb6d637..74a3ea22 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -28,6 +28,8 @@ Author: paboyle /* END LEGAL */ #pragma once +#include + #ifdef HAVE_MALLOC_MALLOC_H #include #endif @@ -334,12 +336,11 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// +#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT -#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned - #define accelerator #define accelerator_inline strong_inline #define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); @@ -365,6 +366,14 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr);}; #endif // CPU target +#ifdef HAVE_MM_MALLOC_H +inline void *acceleratorAllocCpu(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; +inline void acceleratorFreeCpu (void *ptr){_mm_free(ptr);}; +#else +inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);}; +inline void acceleratorFreeCpu (void *ptr){free(ptr);}; +#endif + /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index e93f3046..656e29a9 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -318,6 +318,11 @@ void Grid_init(int *argc,char ***argv) Grid_debug_handler_init(); } + ////////////////////////////////////////////////////////// + // Memory manager + ////////////////////////////////////////////////////////// + MemoryManager::Init(); + ////////////////////////////////////////////////////////// // MPI initialisation ////////////////////////////////////////////////////////// @@ -357,11 +362,6 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "================================================ "< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + 
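// (annotation, not part of the patch) A minimal sketch of wiring the
// PVdagMLinearOperator above into the non-Hermitian PGCR added by this patch.
// TrivialPrecon (an identity preconditioner) and the Ddwf/Dpv/src/sol names are
// illustrative assumptions, not part of the patch:
//
//   PVdagMLinearOperator<DomainWallFermionR,LatticeFermion> PVdagM(Ddwf,Dpv);
//   TrivialPrecon<LatticeFermion> Simple;
//   PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion>
//       PGCR(1.0e-8, 100, PVdagM, Simple, /*mmax*/ 16, /*nstep*/ 16);
//   PGCR(src, sol);   // solves (Mpv^dag M) sol = src; the solver zeroes sol itself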
+#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + MultiGridPreconditioner(Aggregates &Agg, CoarseOperator &Coarse, + FineOperator &Fine, + FineSmoother &Smoother, + Guesser &Guess_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _CoarseOperator(Coarse), + _FineOperator(Fine), + _Smoother(Smoother), + _Guess(Guess_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + CoarseVector Csrc(_CoarseOperator.Grid()); + CoarseVector Csol(_CoarseOperator.Grid()); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + 
Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,500,100,100,0.0); + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + Gamma5R5HermitianLinearOperator HermIndefOpPV(Dpv); + + std::cout< CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + // GeneralisedMinimalResidual CoarseGMRES(tol,MaxIt,20); + + BiCGSTAB FineBiCGSTAB(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + // GeneralisedMinimalResidual FineGMRES(tol,MaxIt,20); + + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + PVdagMLinearOperator FinePVdagM(Ddwf,Dpv);// M_{pv}^\dag M + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + SchurDiagOneOperator FineDiagOne(Ddwf); // 1 - M_ee^{-1} Meo Moo^{-1} Moe e + + MdagMLinearOperator CoarseMdagM(LDOp); + PVdagMLinearOperator CoarsePVdagM(LDOp,LDOpPV); + + std::cout< IRLCheby(0.03,12.0,71); // 1 iter + FunctionHermOp IRLOpCheby(IRLCheby,CoarseMdagM); + PlainHermOp IRLOp (CoarseMdagM); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + + std::cout< DeflCoarseGuesser(evec,eval); + NormalEquations DeflCoarseCGNE (LDOp,CoarseCG,DeflCoarseGuesser); + c_res=Zero(); + DeflCoarseCGNE(c_src,c_res); + + + std::cout< CoarseMgridCG(0.001,1000); + ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); + + typedef HDCRPreconditioner > TwoLevelHDCR; + TwoLevelHDCR TwoLevelPrecon(Aggregates, + HermIndefOp, + FineSmoother, + DeflCoarseCGNE); + TwoLevelPrecon.Level(1); + // PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + PrecGeneralisedConjugateResidualNonHermitian l1PGCR(1.0e-8,100,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + + f_res=Zero(); + + CoarseCG.Tolerance=0.02; + l1PGCR(f_src,f_res); + + std::cout< CoarseMgridBiCGSTAB(0.01,1000); + BiCGSTAB FineMgridBiCGSTAB(0.0,24); + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser FineZeroGuesser; + + SolverWrapper FineBiCGSmoother( FinePVdagM, FineMgridBiCGSTAB, FineZeroGuesser); + SolverWrapper CoarsePVdagMSolver(CoarsePVdagM,CoarseMgridBiCGSTAB,CoarseZeroGuesser); + typedef HDCRPreconditioner > TwoLevelMG; + + TwoLevelMG _TwoLevelMG(Aggregates, + FinePVdagM, + FineBiCGSmoother, + CoarsePVdagMSolver); + _TwoLevelMG.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian pvPGCR(1.0e-8,100,FinePVdagM,_TwoLevelMG,16,16); + pvPGCR.Level(1); + + f_res=Zero(); + pvPGCR(f_src,f_res); + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +//#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; + +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; 
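// (annotation, not part of the patch) Design note on the two smoothers above:
// ChebyshevSmoother applies a fixed-order polynomial approximation to 1/x in
// MdagM (after one AdjOp), so each call costs a fixed number of operator
// applications; MirsSmoother instead runs a shifted CG, (MdagM + shift) out =
// Mdag in, to a loose tolerance. A sketch of constructing each, assuming a
// DomainWallFermionR action Ddwf and a Hermitian-indefinite wrapper
// HermIndefOp as in the two-level test above:
//
//   ChebyshevSmoother<DomainWallFermionR,LatticeFermion> ChebySmooth(0.5,60.0,10,HermIndefOp,Ddwf);
//   MirsSmoother<DomainWallFermionR,LatticeFermion>      CGSmooth(0.5,1.0e-2,20,HermIndefOp,Ddwf);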
+ +#define GridLogLevel std::cout << GridLogMessage < +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &Smoother, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _Smoother(Smoother), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _Aggregates.CoarseGrid; + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + double t; + // Fine Smoother + t=-usecond(); + _Smoother(in,out); + t+=usecond(); + GridLogLevel << "Smoother took "<< t/1000.0<< "ms" < block ({2,2,2,2}); + const int nbasis= 8; + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< SubspaceOp(Dw); + + Subspace Aggregates4D(Coarse4d,UGrid,0); + Subspace Aggregates5D(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + std::cout< Level1Op; + + NonHermitianLinearOperator LinOpDwf(Ddwf); + + Level1Op LDOp (*Coarse5d,0); + + std::cout< CoarseMdagM(LDOp); + BiCGSTAB CoarseBiCGSTAB(tol,MaxIt); + ConjugateGradient CoarseCG(tol,MaxIt); + + c_res=Zero(); + CoarseCG(CoarseMdagM,c_src,c_res); + + std::cout<
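// (annotation, not part of the patch) Usage sketch for the new Wick-contraction
// interface in the BaryonUtils.h hunk above: the old char* quarks_left /
// quarks_right arguments are replaced by a bool[6] mask that Wick_Contractions
// computes from the initial- and final-state flavour strings. The FImpl,
// flavour strings and field names here are illustrative assumptions:
//
//   bool wick[6];
//   BaryonUtils<WilsonImplR>::Wick_Contractions("udu","udu",wick);
//   BaryonUtils<WilsonImplR>::ContractBaryons(q1,q2,q3,
//                                             GammaA,GammaB,GammaA,GammaB,
//                                             wick, /*parity*/ +1, corr);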