Merge pull request #4 from paboyle/develop

merge
2026-05-21 17:44:16 +01:00 · 2020-05-11 20:59:29 +02:00
parent 3c6ffcb48c ea08f193e7
commit b1c86900b2
16 changed files with 582 additions and 452 deletions
@@ -257,13 +257,11 @@ public:
      virtual  RealD Mpc      (const Field &in, Field &out) {
      Field tmp(in.Grid());
      tmp.Checkerboard() = !in.Checkerboard();
-	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);

-      //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
      }
@@ -366,6 +364,9 @@ public:
        void OpDir(const Field& in, Field& out, int dir, int disp) {
          assert(0);
        }
+        void OpDirAll(const Field& in, std::vector<Field>& out){
+          assert(0);
+        };
    };

    template<class Matrix, class Field>
@@ -234,10 +234,8 @@ public:

    GridBase *grid=in.Grid();

-    // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
-    //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
-
    int vol=grid->gSites();
+    typedef typename Field::vector_type vector_type;

    Field T0(grid); T0 = in;  
    Field T1(grid); 
@@ -260,12 +258,26 @@ public:
    for(int n=2;n<order;n++){

      Linop.HermOp(*Tn,y);
-      //     y=xscale*y+mscale*(*Tn);
-      //      *Tnp=2.0*y-(*Tnm);
-      //      out=out+Coeffs[n]* (*Tnp);
+#if 0
+      auto y_v = y.View();
+      auto Tn_v = Tn->View();
+      auto Tnp_v = Tnp->View();
+      auto Tnm_v = Tnm->View();
+      constexpr int Nsimd = vector_type::Nsimd();
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+      });
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#else
      axpby(y,xscale,mscale,y,(*Tn));
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
-      axpy(out,Coeffs[n],*Tnp,out);
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#endif
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
@@ -6,21 +6,39 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;

-#ifdef GRID_NVCC
-#define SMALL_LIMIT (0)
+int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
+#ifdef GRID_CUDA
+int PointerCache::Ncache      = 32;
 #else 
-#define SMALL_LIMIT (4096)
+int PointerCache::Ncache      = 8;
 #endif
+int PointerCache::Victim;
+int PointerCache::VictimSmall;
+PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
+PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];

-#ifdef POINTER_CACHE
-int PointerCache::victim;
+void PointerCache::Init(void)
+{
+  char * str;

-PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+  str= getenv("GRID_ALLOC_NCACHE_LARGE");
+  if ( str ) Ncache = atoi(str);
+  if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;

-void *PointerCache::Insert(void *ptr,size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return ptr;
+  str= getenv("GRID_ALLOC_NCACHE_SMALL");
+  if ( str ) NcacheSmall = atoi(str);
+  if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;

+  //  printf("Aligned allocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
+}
+void *PointerCache::Insert(void *ptr,size_t bytes) 
+{
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
+    return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
+  return Insert(ptr,bytes,Entries,Ncache,Victim);  
+}
+void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) 
+{
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
@@ -28,8 +46,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
  void * ret = NULL;
  int v = -1;

-  for(int e=0;e<Ncache;e++) {
-    if ( Entries[e].valid==0 ) {
+  for(int e=0;e<ncache;e++) {
+    if ( entries[e].valid==0 ) {
      v=e; 
      break;
    }
@@ -37,40 +55,43 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {

  if ( v==-1 ) {
    v=victim;
-    victim = (victim+1)%Ncache;
+    victim = (victim+1)%ncache;
  }

-  if ( Entries[v].valid ) {
-    ret = Entries[v].address;
-    Entries[v].valid = 0;
-    Entries[v].address = NULL;
-    Entries[v].bytes = 0;
+  if ( entries[v].valid ) {
+    ret = entries[v].address;
+    entries[v].valid = 0;
+    entries[v].address = NULL;
+    entries[v].bytes = 0;
  }

-  Entries[v].address=ptr;
-  Entries[v].bytes  =bytes;
-  Entries[v].valid  =1;
+  entries[v].address=ptr;
+  entries[v].bytes  =bytes;
+  entries[v].valid  =1;

  return ret;
 }

-void *PointerCache::Lookup(size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return NULL;
-
+void *PointerCache::Lookup(size_t bytes)
+{
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
+    return Lookup(bytes,EntriesSmall,NcacheSmall);
+  return Lookup(bytes,Entries,Ncache);
+}
+void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) 
+{
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
-
-  for(int e=0;e<Ncache;e++){
-    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
-      Entries[e].valid = 0;
-      return Entries[e].address;
+  for(int e=0;e<ncache;e++){
+    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
+      entries[e].valid = 0;
+      return entries[e].address;
    }
  }
  return NULL;
 }
-#endif
+

 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
@@ -42,21 +42,21 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #define POINTER_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
+#define GRID_ALLOC_SMALL_LIMIT (4096)

 NAMESPACE_BEGIN(Grid);

 // Move control to configure.ac and Config.h?
-#ifdef POINTER_CACHE
+
 class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC 
-  static const int Ncache=128;
-#else
-  static const int Ncache=8;
-#endif
-  static int victim;
+/* Could make these configurable, perhaps up to a max size*/
+  static const int NcacheSmallMax=128; 
+  static const int NcacheMax=16;
+  static int NcacheSmall;
+  static int Ncache;

  typedef struct { 
    void *address;
@@ -64,15 +64,18 @@ private:
    int valid;
  } PointerCacheEntry;
    
-  static PointerCacheEntry Entries[Ncache];
+  static PointerCacheEntry Entries[NcacheMax];
+  static int Victim;
+  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
+  static int VictimSmall;

 public:
-
+  static void Init(void);
  static void *Insert(void *ptr,size_t bytes) ;
+  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes) ;
-
+  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
 };
-#endif  

 std::string sizeString(size_t bytes);

@@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
  if (heap_bytes >= heap_size) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
    assert(heap_bytes<heap_size);
  }
  //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
@@ -40,6 +40,7 @@ NAMESPACE_BEGIN(Grid);

 template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard()=lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -50,6 +51,7 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){

 template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard() = lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -110,15 +110,15 @@ public:
 #endif
    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
  }
-  GridTime Elapsed(void) {
+  GridTime Elapsed(void) const {
    assert(running == false);
    return std::chrono::duration_cast<GridTime>( accumulator );
  }
-  uint64_t useconds(void){
+  uint64_t useconds(void) const {
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
-  bool isRunning(void){
+  bool isRunning(void) const {
    return running;
  }
 };
@@ -59,7 +59,7 @@ public:
  {
    RealD eps = 1.0;

-    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    //    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
    assert(zdata->n==this->Ls);
 	
@@ -779,9 +779,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);

-  int tshift = (mu == Nd-1) ? 1 : 0;

 #if 0
+  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // SHAMIR CASE 
  ////////////////////////////////////////////////
@@ -829,6 +829,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #endif

 #ifndef GRID_NVCC
+  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@@ -159,6 +159,7 @@ const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
 						    Complex(-1),
 						    Complex(-1)};

+//This is the old version
 template <class FImpl>
 template <class mobj, class robj>
 void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
@@ -180,6 +181,10 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
    auto gD3 = GammaB_right * D3;

+    auto D2g = D2 * GammaB_left;
+    auto pD1g = pD1 * GammaB_left;
+    auto gD3g = gD3 * GammaB_left;
+
    for (int ie_left=0; ie_left < 6 ; ie_left++){
      int a_left = epsilon[ie_left][0]; //a
      int b_left = epsilon[ie_left][1]; //b
@@ -188,58 +193,71 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
        int a_right = epsilon[ie_right][0]; //a'
        int b_right = epsilon[ie_right][1]; //b'
        int c_right = epsilon[ie_right][2]; //c'
+	Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
        //This is the \delta_{456}^{123} part
 	if (wick_contraction[0]){
-          auto D2g = D2 * GammaB_left;
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	        result()()() += eepD1*D2g_ab*gD3_ab;
          }}}
  	}	  
        //This is the \delta_{456}^{231} part
 	if (wick_contraction[1]){
-          auto pD1g = pD1 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+		result()()() += eepD1g_gb*D2_ab*gD3_ag;
          }}}
        }	  
        //This is the \delta_{456}^{312} part
 	if (wick_contraction[2]){
-          auto gD3g = gD3 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+		result()()() += eepD1_gb*D2_ag*gD3g_ab;
          }}}
        }	  
        //This is the \delta_{456}^{132} part
 	if (wick_contraction[3]){
-          auto gD3g = gD3 * GammaB_left;
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
          }}}
        }	  
        //This is the \delta_{456}^{321} part
 	if (wick_contraction[4]){
-          auto D2g = D2 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
          }}}
        }	  
        //This is the \delta_{456}^{213} part
 	if (wick_contraction[5]){
-          auto pD1g = pD1 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
          }}}
        }	  
      }
@@ -259,6 +277,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
@@ -305,6 +327,10 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 						 const int parity,
 						 robj &result)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
@@ -318,7 +344,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;

     result=Zero();
-     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }

 /***********************************************************************
@@ -558,6 +584,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
  GridBase *grid = qs_ti.Grid();

  auto vcorr= stn_corr.View();
@@ -595,6 +625,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
  GridBase *grid = qs_ti.Grid();

  auto vcorr= stn_corr.View();
@@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv)
  //////////////////////////////////////////////////////////
  GridGpuInit(); // Must come first to set device prior to MPI init

+  PointerCache::Init();
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
@@ -56,6 +56,7 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
+void GridCmdOptionInt(std::string &str,int & val);


 void GridParseLayout(char **argv,int argc,
@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace Grid;

-
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
@@ -76,7 +75,6 @@ struct controls {
  int Opt;
  int CommsOverlap;
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
-  //  int HugePages;
 };

 class Benchmark {
@@ -119,14 +117,15 @@ public:
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();

-    for(int lat=4;lat<=maxlat;lat+=4){
-      for(int Ls=8;Ls<=8;Ls*=2){
+    for(int lat=16;lat<=maxlat;lat+=8){
+      //      for(int Ls=8;Ls<=8;Ls*=2){
+      { int Ls=12;

 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
-
+	std::cout << GridLogMessage<< latt_size <<std::endl;
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -184,9 +183,6 @@ public:
 	}

 	timestat.statistics(t_time);
-	//	for(int i=0;i<t_time.size();i++){
-	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
-	//	}

 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
@@ -200,8 +196,6 @@ public:
 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 	
- 
-	
 	    }
    }    

@@ -227,14 +221,15 @@ public:
    uint64_t NN;


-  uint64_t lmax=48;
+  uint64_t lmax=32;
 #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=4){
+    for(int lat=8;lat<=lmax;lat+=8){

      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      //      NP= Grid.RankCount();
@@ -270,191 +265,8 @@ public:
    }
  };

-#if 0
-  static double DWF5(int Ls,int L)
-  {
-    //    RealD mass=0.1;
-    RealD M5  =1.8;

-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
-    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    std::vector<int> seeds5({5,6,7,8});
-    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    ///////// Source preparation ////////////
-    LatticeFermion src   (sFGrid); 
-    LatticeFermion tmp   (sFGrid);
-    std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
-    random(RNG5,src);
-    std::cout << GridLogMessage << "intialised random source" << std::endl;
-
-    RealD N2 = 1.0/::sqrt(norm2(src));
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-
-    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
-    LatticeFermion src_e (sFrbGrid);
-    LatticeFermion src_o (sFrbGrid);
-    LatticeFermion r_e   (sFrbGrid);
-    LatticeFermion r_o   (sFrbGrid);
-    LatticeFermion r_eo  (sFGrid);
-    LatticeFermion err   (sFGrid);
-    {
-
-      pickCheckerboard(Even,src_e,src);
-      pickCheckerboard(Odd,src_o,src);
-
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
-      const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
-      controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-
-	 WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	 WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-	int nwarm = 100;
-	uint64_t ncall = 1000;
-
-	double t0=usecond();
-	sFGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	}
-	sFGrid->Barrier();
-	double t1=usecond();
-
-	sDw.ZeroCounters();
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	sFGrid->Barrier();
-	
-	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344.0*volume)/2;
-	double mf_hi, mf_lo, mf_err;
-
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
-
-	sDw.Report();
-
-      }
-      double robust = mflops_worst/mflops_best;;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-
-      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    }
-    return mflops_best;
-  }
-#endif
-
-  static double DWF(int Ls,int L, double & robust)
+  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
@@ -471,37 +283,30 @@ public:
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});

-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

-
    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -514,74 +319,31 @@ public:
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

+    typedef DomainWallFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+    
    ///////// Source preparation ////////////
-    LatticeFermion src   (FGrid); random(RNG5,src);
-    LatticeFermion ref   (FGrid);
-    LatticeFermion tmp   (FGrid);
+    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Fermion src   (FGrid); random(RNG5,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

-    RealD N2 = 1.0/::sqrt(norm2(src));
-    std::cout<<GridLogMessage << "Normalising src  "<< N2 <<std::endl;
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-    
-
-    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    ////////////////////////////////////
-    // Naive wilson implementation
-    ////////////////////////////////////
-    {
-      LatticeGaugeField Umu5d(FGrid); 
-      std::vector<LatticeColourMatrix> U(4,FGrid);
-      auto Umu_v = Umu.View();
-      auto Umu5d_v = Umu5d.View();
-      for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  Umu5d_v[Ls*ss+s] = Umu_v[ss];
-	}
-      }
-      ref = Zero();
-      for(int mu=0;mu<Nd;mu++){
-	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-      }
-      for(int mu=0;mu<Nd;mu++){
-	
-	tmp = U[mu]*Cshift(src,mu+1,1);
-	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-	
-	tmp =adj(U[mu])*src;
-	tmp =Cshift(tmp,mu+1,-1);
-	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-      }
-      ref = -0.5*ref;
-    }
-
-    LatticeFermion src_e (FrbGrid);
-    LatticeFermion src_o (FrbGrid);
-    LatticeFermion r_e   (FrbGrid);
-    LatticeFermion r_o   (FrbGrid);
-    LatticeFermion r_eo  (FGrid);
-    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
      const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
      controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
@@ -594,15 +356,12 @@ public:

 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

-	int nwarm = 200;
+	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -610,9 +369,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
-	//	if (ncall < 500) ncall = 500;
-	uint64_t ncall = 1000;
+	uint64_t ncall = 50;

 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

@@ -649,24 +406,11 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;

-	Dw.Report();
-
-	Dw.DhopEO(src_o,r_e,DaggerNo);
-	Dw.DhopOE(src_e,r_o,DaggerNo);
-	setCheckerboard(r_eo,r_o);
-	setCheckerboard(r_eo,r_e);
-	err = r_eo-ref; 
-	RealD absref = norm2(ref);
-	RealD abserr = norm2(err);
-	std::cout<<GridLogMessage << "norm diff   "<< abserr << " / " << absref<<std::endl;
-	assert(abserr<1.0e-4);
-
      }
-      robust = mflops_worst/mflops_best;
+
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

@@ -680,8 +424,166 @@ public:
    return mflops_best;
  }

+
+  static double Staggered(int L) // Benchmark ImprovedStaggeredFermionF::DhopEO with an L^4 local volume per rank; returns mflops_best over the four cases
+  {
+    double mflops;
+    double mflops_best = 0; // 0 also means "not set yet" (see the ==0 tests in the case loop)
+    double mflops_worst= 0;
+    std::vector<double> mflops_all; // mean rate of every case, in Cases[] order
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), // fixed 72^4 grid, used below only for RankCount()/NodeCount(); never deleted
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN; // publishes the node count through NN_global, which is declared outside this function
+    uint64_t SHM=NP/NN; // ranks per node, printed in the welcome message
+
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); // global lattice = local L^4 scaled by the rank geometry
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); // vComplexF SIMD layout here, whereas TmpGrid above used vComplex
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1; // mass, c1, c2 and u0 are passed unchanged to the Action constructor below
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    
+    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); // the same Umu is passed for both gauge-field arguments
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid); // r_o and r_eo are declared but not used in this function
+    Fermion r_eo  (FGrid);
+  
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+    
+      const int num_cases = 4; // must equal the number of entries in Cases[]
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); // legend printed above the per-case results; presumably kernel / comms overlap / communicator policy, in Cases[] order
+      
+      controls Cases [] = {
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+	
+	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap; // static switches: they stay at the last case's values after the loop
+	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	
+	int nwarm = 10; // untimed warm-up applications
+	double t0=usecond(); // NOTE: the warm-up t0/t1 values are never read; both are overwritten inside the timed loop
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500; // fixed number of timed calls
+
+	FGrid->Broadcast(0,&ncall,sizeof(ncall)); // presumably makes every rank use the value held by rank 0 - confirm Broadcast's root argument
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	Ds.ZeroCounters();
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall); // elapsed usecond() difference of each individual call
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; // product of the global lattice extents
+	double flops=(1146.0*volume)/2; // NOTE(review): presumably 1146 flops per site on one checkerboard (half the sites) - confirm the constant
+	double mf_hi, mf_lo, mf_err;
+	
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min; // fastest call gives the highest rate
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean; // reported as mflop/s below; assumes usecond() counts microseconds
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops; // first case initialises best/worst
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+      
+      }
+
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){ // per-node rate of each case, in the order given by fmt
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best; // whole-machine rate, not divided by NN
+  }
 };

+
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -696,62 +598,50 @@ int main (int argc, char ** argv)

  int do_memory=1;
  int do_comms =1;
-  int do_su3   =0;
-  int do_wilson=1;
-  int do_dwf   =1;

-  if ( do_su3 ) {
-    // empty for now
-  }
-#if 1
  int sel=2;
-  Coordinate L_list({8,12,16,24});
-#else
-  int sel=1;
-  Coordinate L_list({8,12});
-#endif
+  std::vector<int> L_list({16,24,32});
  int selm1=sel-1;
-  std::vector<double> robust_list;

  std::vector<double> wilson;
  std::vector<double> dwf4;
-  std::vector<double> dwf5;
+  std::vector<double> staggered;

-  if ( do_wilson ) {
-    int Ls=1;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
-    }
+  int Ls=1;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
  }

-  int Ls=16;
-  if ( do_dwf ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
-      dwf4.push_back(result);
-      robust_list.push_back(robust);
-    }
+  Ls=12;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::DWF(Ls,L_list[l]) ;
+    dwf4.push_back(result);
  }

-  if ( do_dwf ) {
+  /*
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::Staggered(L_list[l]) ;
+    staggered.push_back(result);
+  }
+  */

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 " <<std::endl; // header lists only the columns printed below; the Staggered loop above is commented out, so the staggered vector is empty
   for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
   }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  }

  int NN=NN_global;
  if ( do_memory ) {
@@ -768,24 +658,20 @@ int main (int argc, char ** argv)
    Benchmark::Comms();
  }

-  if ( do_dwf ) {
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
-  }
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
-  std::cout<<std::setprecision(3);
-  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
+    std::cout<<std::setprecision(3);
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  Grid_finalize();
 }
@@ -0,0 +1,176 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_dwf.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+  Gamma::Algebra Gmu [] = { // gamma-matrix identifiers in X,Y,Z,T order; not referenced anywhere else in this benchmark source
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+void benchDw(std::vector<int> & L, int Ls);
+
+int main (int argc, char ** argv) // Driver: calls benchDw for each lattice in a fixed list at Ls=12 and prints one result row per lattice
+{
+  Grid_init(&argc,&argv);
+
+
+  const int Ls=12; // fifth-dimension extent handed to every benchDw call
+  std::vector< std::vector<int> > latts; // 4d lattice sizes to benchmark
+#if 1 // default list of four lattices; the #else branch holds a single larger one
+  latts.push_back(std::vector<int> ({24,24,24,24}) );
+  latts.push_back(std::vector<int> ({48,24,24,24}) );
+  latts.push_back(std::vector<int> ({96,24,24,24}) );
+  latts.push_back(std::vector<int> ({96,48,24,24}) );
+  //  latts.push_back(std::vector<int> ({96,48,48,24}) );
+  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
+#else
+  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
+  latts.push_back(std::vector<int> ({96,96,96,192}) );
+#endif
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; // reports whichever kernel variant is currently selected; this function never changes it
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl; // NOTE: benchDw prints more columns than are named here (rank count, two Mpc rates, seconds, then two Chebyshev rates and seconds)
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+
+  for (int l=0;l<latts.size();l++){
+    std::vector<int> latt4 = latts[l];
+    std::cout << GridLogMessage <<"\t";
+    for(int d=0;d<Nd;d++){ // volume column, printed as X x Y x Z x T x Ls
+      std::cout<<latt4[d]<<"x";
+    }
+    std::cout <<Ls<<"\t" ;
+    benchDw (latt4,Ls); // appends the timing columns and the terminating newline for this row
+  }
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  Grid_finalize();
+}
+
+
+void benchDw(std::vector<int> & latt4, int Ls) // Times SchurDiagOneOperator::Mpc and a Chebyshev of it for a MobiusFermionF on latt4 x Ls; prints one tab-separated result row
+{
+  /////////////////////////////////////////////////////////////////////////////////////
+  // for Nc=3
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Dw :  Ls*24*(7+48)= Ls*1320 
+  //
+  // M5D:  Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
+  // Meo:  Ls*24*(7+48) + Ls*72 = Ls*1392 
+  //
+  // Mee:  3*Ns*2*Nc*Ls  // Chroma 6*N5*Nc*Ns 
+  //
+  // LeemInv : 2*2*Nc*madd*Ls
+  // LeeInv  : 2*2*Nc*madd*Ls
+  // DeeInv  : 4*2*Nc*mul *Ls
+  // UeeInv  : 2*2*Nc*madd*Ls
+  // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
+  // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
+  // Mpc => 1452*cbvol*2*Ls flops // 
+  //     => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
+  /////////////////////////////////////////////////////////////////////////////////////
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); // the four grids allocated here are not deleted before the function returns
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  //  long unsigned int single_site_flops     = 8*Nc*(7+16*Nc)*Ls;
+  long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls; // NOTE(review): for Nc=3 this is 2904*Ls and is multiplied by the full 4d volume below, i.e. 2x the 1452*cbvol*2*Ls quoted in the header comment - confirm which is intended
+  long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls; // NOTE(review): for Nc=3 this is (2664+48*Ls)*Ls, not the (1344+Ls*48)*Ls of the header comment - confirm
+  std::vector<int> seeds4({1,2,3,4}); // seeds4/seeds5 are unused: no RNG is constructed in this function
+  std::vector<int> seeds5({5,6,7,8});
+
+
+  ColourMatrixF cm = ComplexF(1.0,0.0); // not referenced again below
+
+  int ncall=300; // number of timed Mpc applications
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  RealD NP = UGrid->_Nprocessors; // rank count, used only for printing
+  double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; // full 4d lattice volume
+
+  LatticeGaugeFieldF Umu(UGrid); Umu=Zero(); // all-zero gauge field: only run time is measured, no result is checked
+  MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5); // NOTE(review): 1.5,0.5 are presumably the Mobius b and c coefficients - confirm against the constructor
+  
+  LatticeFermionF src_o (FrbGrid); src_o=1.0;
+  LatticeFermionF r_o   (FrbGrid); r_o=Zero();
+
+  int order =151;
+  SchurDiagOneOperator<MobiusFermionF,LatticeFermionF>  Mpc(Dw);
+  Chebyshev<LatticeFermionF>      Cheby(0.0,60.0,order); // presumably (lower bound, upper bound, polynomial order) - confirm against the Chebyshev constructor
+
+  {
+    Mpc.Mpc(src_o,r_o); // three untimed warm-up applications
+    Mpc.Mpc(src_o,r_o);
+    Mpc.Mpc(src_o,r_o);
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Mpc.Mpc(src_o,r_o);
+    }
+    double t1=usecond();
+
+    double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo  so CB cancels.
+    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0); // rank count, then the rate from the Grid-style flop count
+    flops=(single_site_quda_flops*volume*ncall);
+    std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t"; // same timing with the QUDA-style count, then elapsed time labelled in seconds
+
+    // Cheby uses MpcDagMpc so 2x flops
+    for(int i=0;i<1;i++){ // executes exactly once
+    Cheby(Mpc,src_o,r_o); // untimed warm-up application
+    t0=usecond();
+    Cheby(Mpc,src_o,r_o);
+    t1=usecond();
+    flops=(single_site_mpc_flops*volume*2*order);
+    std::cout <<"\t"<<flops/(t1-t0);
+    flops=(single_site_quda_flops*volume*2*order);
+    std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
+    std::cout <<std::endl; // ends the row that main() started
+    }
+  }
+  //  Dw.Report();
+}
+
+
+
@@ -88,25 +88,6 @@ int main (int argc, char ** argv)
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  ref = Zero();
-  /*  
-  { // Naive wilson implementation
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
-      tmp = U[mu]*Cshift(src,mu,1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
-      }
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu,-1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
-      }
-    }
-  }
-  ref = -0.5*ref;
-  */

  RealD mass=0.1;
  RealD c1=9.0/8.0;
@@ -274,12 +274,20 @@ case ${ac_gen_scalar} in
 esac

 ##################### Compiler dependent choices
-case ${CXX} in 
+
+#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison
+CXXBASE=${CXX}
+CXXTEST=${CXX}
+if echo "${CXX}" | grep -q "nvcc"; then
+  CXXTEST="nvcc"
+fi   
+
+case ${CXXTEST} in 
  nvcc) 
 #    CXX="nvcc -keep -v -x cu "
 #    CXXLD="nvcc -v -link"
-    CXX="nvcc -x cu "
-    CXXLD="nvcc -link"
+    CXX="${CXXBASE} -x cu "
+    CXXLD="${CXXBASE} -link"
 #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
    if test $ac_openmp = yes; then