Merge pull request #4 from paboyle/develop

merge
2025-08-18 12:11:53 +01:00 · 2020-05-11 20:59:29 +02:00
parent 3c6ffcb48c ea08f193e7
commit b1c86900b2
16 changed files with 582 additions and 452 deletions
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -257,13 +257,11 @@ public:
      virtual  RealD Mpc      (const Field &in, Field &out) {
      // Applies the checkerboarded operator built from _Mat's Meooe / Mooee / MooeeInv pieces.
      // Call sequence below (assuming each _Mat call reads its first argument and writes its second):
      //   tmp = Meooe(in); out = MooeeInv(tmp); tmp = Meooe(out); out = Mooee(in)
      // followed by axpy_norm(out,-1.0,tmp,out), whose RealD result is returned.
      // NOTE(review): presumably this leaves out = Mooee*in - Meooe*MooeeInv*Meooe*in and
      // returns its norm -- confirm against the axpy_norm definition.
      Field tmp(in.Grid());
      // Scratch field lives on the opposite checkerboard to the input.
      tmp.Checkerboard() = !in.Checkerboard();
 	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);
      //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
      }
@@ -366,6 +364,9 @@ public:
        void OpDir(const Field& in, Field& out, int dir, int disp) {
          // Not supported by this operator: the body unconditionally trips assert(0).
          // None of the arguments are read.
          assert(0);
        }
        void OpDirAll(const Field& in, std::vector<Field>& out){
          // Not supported by this operator: the body unconditionally trips assert(0).
          // Neither `in` nor `out` is touched.
          assert(0);
        };
    };
    template<class Matrix, class Field>
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -234,10 +234,8 @@ public:
    GridBase *grid=in.Grid();
    // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
    //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
    int vol=grid->gSites();
    typedef typename Field::vector_type vector_type;
    Field T0(grid); T0 = in;  
    Field T1(grid); 
@@ -260,12 +258,26 @@ public:
    for(int n=2;n<order;n++){
      Linop.HermOp(*Tn,y);
-      //     y=xscale*y+mscale*(*Tn);
+#if 0
-      //      *Tnp=2.0*y-(*Tnm);
+      auto y_v = y.View();
-      //      out=out+Coeffs[n]* (*Tnp);
+      auto Tn_v = Tn->View();
      auto Tnp_v = Tnp->View();
      auto Tnm_v = Tnm->View();
      constexpr int Nsimd = vector_type::Nsimd();
      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
 #else
      axpby(y,xscale,mscale,y,(*Tn));
      axpby(*Tnp,2.0,-1.0,y,(*Tnm));
      if ( Coeffs[n] != 0.0) {
 	axpy(out,Coeffs[n],*Tnp,out);
      }
 #endif
      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,21 +6,39 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
-#ifdef GRID_NVCC
+int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
-#define SMALL_LIMIT (0)
+#ifdef GRID_CUDA
 int PointerCache::Ncache      = 32;
 #else 
-#define SMALL_LIMIT (4096)
+int PointerCache::Ncache      = 8;
 #endif
 int PointerCache::Victim;
 int PointerCache::VictimSmall;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
 PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
-#ifdef POINTER_CACHE
+void PointerCache::Init(void)
-int PointerCache::victim;
+{
  char * str;
-PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) Ncache = atoi(str);
  if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
-void *PointerCache::Insert(void *ptr,size_t bytes) {
+  str= getenv("GRID_ALLOC_NCACHE_SMALL");
-
+  if ( str ) NcacheSmall = atoi(str);
-  if (bytes < SMALL_LIMIT ) return ptr;
+  if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
  //  printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
 }
 // Two-argument entry point: picks which cache receives `ptr` based on the allocation size.
 // Requests with bytes < GRID_ALLOC_SMALL_LIMIT go to EntriesSmall (bounded by NcacheSmall,
 // rotating on VictimSmall); everything else goes to Entries (bounded by Ncache, rotating on Victim).
 // Returns whatever the five-argument Insert overload returns for the chosen cache.
 void *PointerCache::Insert(void *ptr,size_t bytes) 
 {
  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
    return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
  return Insert(ptr,bytes,Entries,Ncache,Victim);  
 }
 void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) 
 {
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
@@ -28,8 +46,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
  void * ret = NULL;
  int v = -1;
-  for(int e=0;e<Ncache;e++) {
+  for(int e=0;e<ncache;e++) {
-    if ( Entries[e].valid==0 ) {
+    if ( entries[e].valid==0 ) {
      v=e; 
      break;
    }
@@ -37,40 +55,43 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
  if ( v==-1 ) {
    v=victim;
-    victim = (victim+1)%Ncache;
+    victim = (victim+1)%ncache;
  }
-  if ( Entries[v].valid ) {
+  if ( entries[v].valid ) {
-    ret = Entries[v].address;
+    ret = entries[v].address;
-    Entries[v].valid = 0;
+    entries[v].valid = 0;
-    Entries[v].address = NULL;
+    entries[v].address = NULL;
-    Entries[v].bytes = 0;
+    entries[v].bytes = 0;
  }
-  Entries[v].address=ptr;
+  entries[v].address=ptr;
-  Entries[v].bytes  =bytes;
+  entries[v].bytes  =bytes;
-  Entries[v].valid  =1;
+  entries[v].valid  =1;
  return ret;
 }
-void *PointerCache::Lookup(size_t bytes) {
+void *PointerCache::Lookup(size_t bytes)
-
+{
-  if (bytes < SMALL_LIMIT ) return NULL;
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
-
+    return Lookup(bytes,EntriesSmall,NcacheSmall);
  return Lookup(bytes,Entries,Ncache);
 }
 void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) 
 {
 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
 #endif 
-
+  for(int e=0;e<ncache;e++){
-  for(int e=0;e<Ncache;e++){
+    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
-    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
+      entries[e].valid = 0;
-      Entries[e].valid = 0;
+      return entries[e].address;
      return Entries[e].address;
    }
  }
  return NULL;
 }
-#endif
+
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -42,21 +42,21 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #define POINTER_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #define GRID_ALLOC_SMALL_LIMIT (4096)
 NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
-#ifdef POINTER_CACHE
+
 class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC 
+/* Could make these configurable, perhaps up to a max size*/
-  static const int Ncache=128;
+  static const int NcacheSmallMax=128; 
-#else
+  static const int NcacheMax=16;
-  static const int Ncache=8;
+  static int NcacheSmall;
-#endif
+  static int Ncache;
  static int victim;
  typedef struct { 
    void *address;
@@ -64,15 +64,18 @@ private:
    int valid;
  } PointerCacheEntry;
-  static PointerCacheEntry Entries[Ncache];
+  static PointerCacheEntry Entries[NcacheMax];
  static int Victim;
  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
  static int VictimSmall;
 public:
-
+  static void Init(void);
  static void *Insert(void *ptr,size_t bytes) ;
  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
  static void *Lookup(size_t bytes) ;
-
+  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
 };
 #endif  
 std::string sizeString(size_t bytes);
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
  if (heap_bytes >= heap_size) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
    std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
    std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
    assert(heap_bytes<heap_size);
  }
  //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@@ -40,6 +40,7 @@ NAMESPACE_BEGIN(Grid);
 template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
  ret.Checkerboard()=lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -50,6 +51,7 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs.Grid());
  ret.Checkerboard() = lhs.Checkerboard();
  auto lhs_v = lhs.View();
  auto ret_v = ret.View();
  accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -110,15 +110,15 @@ public:
 #endif
    accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
  }
-  GridTime Elapsed(void) {
+  GridTime Elapsed(void) const {
    assert(running == false);
    return std::chrono::duration_cast<GridTime>( accumulator );
  }
-  uint64_t useconds(void){
+  uint64_t useconds(void) const {
    assert(running == false);
    return (uint64_t) accumulator.count();
  }
-  bool isRunning(void){
+  bool isRunning(void) const {
    return running;
  }
 };
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@@ -59,7 +59,7 @@ public:
  {
    RealD eps = 1.0;
-    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    //    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
    Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
    assert(zdata->n==this->Ls);
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -779,9 +779,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  assert(mu>=0);
  assert(mu<Nd);
  int tshift = (mu == Nd-1) ? 1 : 0;
 #if 0
  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // SHAMIR CASE 
  ////////////////////////////////////////////////
@@ -829,6 +829,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #endif
 #ifndef GRID_NVCC
  int tshift = (mu == Nd-1) ? 1 : 0;
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -159,6 +159,7 @@ const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
 						    Complex(-1),
 						    Complex(-1)};
 //This is the old version
 template <class FImpl>
 template <class mobj, class robj>
 void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
@@ -180,6 +181,10 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
    auto gD3 = GammaB_right * D3;
    auto D2g = D2 * GammaB_left;
    auto pD1g = pD1 * GammaB_left;
    auto gD3g = gD3 * GammaB_left;
    for (int ie_left=0; ie_left < 6 ; ie_left++){
      int a_left = epsilon[ie_left][0]; //a
      int b_left = epsilon[ie_left][1]; //b
@@ -188,58 +193,71 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
        int a_right = epsilon[ie_right][0]; //a'
        int b_right = epsilon[ie_right][1]; //b'
        int c_right = epsilon[ie_right][2]; //c'
 	Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
        //This is the \delta_{456}^{123} part
 	if (wick_contraction[0]){
-          auto D2g = D2 * GammaB_left;
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
 	        result()()() += eepD1*D2g_ab*gD3_ab;
          }}}
  	}	  
        //This is the \delta_{456}^{231} part
 	if (wick_contraction[1]){
          auto pD1g = pD1 * GammaB_left;
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
 	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
 		result()()() += eepD1g_gb*D2_ab*gD3_ag;
          }}}
        }	  
        //This is the \delta_{456}^{312} part
 	if (wick_contraction[2]){
          auto gD3g = gD3 * GammaB_left;
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
 	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
 		result()()() += eepD1_gb*D2_ag*gD3g_ab;
          }}}
        }	  
        //This is the \delta_{456}^{132} part
 	if (wick_contraction[3]){
-          auto gD3g = gD3 * GammaB_left;
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
    		result()()() -= eepD1*D2_ab*gD3g_ab;
          }}}
        }	  
        //This is the \delta_{456}^{321} part
 	if (wick_contraction[4]){
          auto D2g = D2 * GammaB_left;
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
 	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
 		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
          }}}
        }	  
        //This is the \delta_{456}^{213} part
 	if (wick_contraction[5]){
          auto pD1g = pD1 * GammaB_left;
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
 	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
          }}}
        }	  
      }
@@ -259,6 +277,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
@@ -305,6 +327,10 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 						 const int parity,
 						 robj &result)
 {
  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
@@ -318,7 +344,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
     result=Zero();
-     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }
 /***********************************************************************
@@ -558,6 +584,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
  GridBase *grid = qs_ti.Grid();
  auto vcorr= stn_corr.View();
@@ -595,6 +625,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
  GridBase *grid = qs_ti.Grid();
  auto vcorr= stn_corr.View();
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv)
  //////////////////////////////////////////////////////////
  GridGpuInit(); // Must come first to set device prior to MPI init
  PointerCache::Init();
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -56,6 +56,7 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
 void GridParseLayout(char **argv,int argc,
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 using namespace Grid;
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
@@ -76,7 +75,6 @@ struct controls {
  int Opt;
  int CommsOverlap;
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
  //  int HugePages;
 };
 class Benchmark {
@@ -119,14 +117,15 @@ public:
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();
-    for(int lat=4;lat<=maxlat;lat+=4){
+    for(int lat=16;lat<=maxlat;lat+=8){
-      for(int Ls=8;Ls<=8;Ls*=2){
+      //      for(int Ls=8;Ls<=8;Ls*=2){
      { int Ls=12;
 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
-
+	std::cout << GridLogMessage<< latt_size <<std::endl;
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -184,9 +183,6 @@ public:
 	}
 	timestat.statistics(t_time);
 	//	for(int i=0;i<t_time.size();i++){
 	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
 	//	}
 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
@@ -200,8 +196,6 @@ public:
 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 	    }
    }    
@@ -227,14 +221,15 @@ public:
    uint64_t NN;
-  uint64_t lmax=48;
+  uint64_t lmax=32;
 #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=4){
+    for(int lat=8;lat<=lmax;lat+=8){
      Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      //      NP= Grid.RankCount();
@@ -270,191 +265,8 @@ public:
    }
  };
 #if 0
  static double DWF5(int Ls,int L)
  {
    //    RealD mass=0.1;
    RealD M5  =1.8;
-    double mflops;
+  static double DWF(int Ls,int L)
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    Coordinate internal;
    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
    else assert(0);
    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    ///////// Source preparation ////////////
    LatticeFermion src   (sFGrid); 
    LatticeFermion tmp   (sFGrid);
    std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
    random(RNG5,src);
    std::cout << GridLogMessage << "intialised random source" << std::endl;
    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;
    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    LatticeFermion r_eo  (sFGrid);
    LatticeFermion err   (sFGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
 #if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
      controls Cases [] = {
 #ifdef AVX512
 	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	 WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	 WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	int nwarm = 100;
 	uint64_t ncall = 1000;
 	double t0=usecond();
 	sFGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  sDw.DhopEO(src_o,r_e,DaggerNo);
 	}
 	sFGrid->Barrier();
 	double t1=usecond();
 	sDw.ZeroCounters();
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  sDw.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	sFGrid->Barrier();
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	double flops=(1344.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
 	sDw.Report();
      }
      double robust = mflops_worst/mflops_best;;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    }
    return mflops_best;
  }
 #endif
  static double DWF(int Ls,int L, double & robust)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
@@ -471,37 +283,30 @@ public:
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
-    Coordinate internal;
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
    else assert(0);
    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -514,74 +319,31 @@ public:
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    typedef DomainWallFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;
    ///////// Source preparation ////////////
-    LatticeFermion src   (FGrid); random(RNG5,src);
+    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-    LatticeFermion ref   (FGrid);
+    Fermion src   (FGrid); random(RNG5,src);
-    LatticeFermion tmp   (FGrid);
+    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
    Fermion r_e   (FrbGrid);
    Fermion r_o   (FrbGrid);
    Fermion r_eo  (FGrid);
    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
    RealD N2 = 1.0/::sqrt(norm2(src));
    std::cout<<GridLogMessage << "Normalising src  "<< N2 <<std::endl;
    src = src*N2;
    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
    {
      LatticeGaugeField Umu5d(FGrid); 
      std::vector<LatticeColourMatrix> U(4,FGrid);
      auto Umu_v = Umu.View();
      auto Umu5d_v = Umu5d.View();
      for(int ss=0;ss<Umu.Grid()->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  Umu5d_v[Ls*ss+s] = Umu_v[ss];
 	}
      }
      ref = Zero();
      for(int mu=0;mu<Nd;mu++){
 	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){
 	tmp = U[mu]*Cshift(src,mu+1,1);
 	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 	tmp =adj(U[mu])*src;
 	tmp =Cshift(tmp,mu+1,-1);
 	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }
    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
 #if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
-#endif
+
      controls Cases [] = {
-#ifdef AVX512
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
@@ -594,15 +356,12 @@ public:
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	int nwarm = 200;
+	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -610,9 +369,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	uint64_t ncall = 50;
 	//	if (ncall < 500) ncall = 500;
 	uint64_t ncall = 1000;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
@@ -649,24 +406,11 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 	Dw.Report();
 	Dw.DhopEO(src_o,r_e,DaggerNo);
 	Dw.DhopOE(src_e,r_o,DaggerNo);
 	setCheckerboard(r_eo,r_o);
 	setCheckerboard(r_eo,r_e);
 	err = r_eo-ref; 
 	RealD absref = norm2(ref);
 	RealD abserr = norm2(err);
 	std::cout<<GridLogMessage << "norm diff   "<< abserr << " / " << absref<<std::endl;
 	assert(abserr<1.0e-4);
      }
-      robust = mflops_worst/mflops_best;
+
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
@@ -680,8 +424,166 @@ public:
    return mflops_best;
  }
  // Benchmark the even-odd hopping term (Ds.DhopEO) of ImprovedStaggeredFermionF
  // in single precision on an L^4 local volume per rank.
  //
  // Four configurations are timed: {CommsThenCompute, CommsAndCompute} x
  // {CommunicatorPolicyConcurrent, CommunicatorPolicySequential}, all with the
  // generic kernel. Per-configuration and summary figures are written to
  // std::cout through GridLogMessage.
  //
  // L       : local lattice extent per rank in each of the four directions.
  // returns : the largest mean rate seen over the four configurations, in the
  //           same unit as the printed "mflop/s" figures.
  static double Staggered(int L)
  {
    double mflops;
    // 0 doubles as the "not yet set" sentinel for best/worst (see the updates in the case loop).
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    // Fixed-size 72^4 grid; in this function it is only queried for the rank and node counts.
    // NOTE(review): TmpGrid is never deleted here -- confirm whether that is intentional.
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),
 								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    // NN_global is declared outside this function; it receives the node count found here.
    NN_global=NN;
    uint64_t SHM=NP/NN; // ranks per node (printed below as "ranks/node")
    // Global volume: local extent multiplied by the rank count in each direction.
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    // Full and red-black grids for the global volume, using the vComplexF SIMD layout.
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    // Parameters handed to the Action constructor below.
    RealD mass=0.1;
    RealD c1=9.0/8.0;
    RealD c2=-1.0/24.0;
    RealD u0=1.0;
    typedef ImprovedStaggeredFermionF Action;
    typedef typename Action::FermionField Fermion; 
    typedef LatticeGaugeFieldF Gauge;
    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
    typename Action::ImplParams params;
    // The same gauge field Umu is supplied for both gauge-field arguments of the constructor.
    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
    ///////// Source preparation ////////////
    Fermion src   (FGrid); random(RNG4,src);
    // Only src_o and r_e take part in the timed DhopEO calls below;
    // src_e is filled but not read again, and r_o / r_eo are not used further in this function.
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
    Fermion r_e   (FrbGrid);
    Fermion r_o   (FrbGrid);
    Fermion r_eo  (FGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
      const int num_cases = 4;
      // Column header for the per-case summary line; one entry per row of Cases, in the same order.
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
      controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	// Select this case through the static kernel settings and the communicator policy.
 	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
 	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
 	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	// Warm-up: nwarm calls bracketed by barriers. t0/t1 are taken around it but
 	// are overwritten in the timed loop before being read.
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  Ds.DhopEO(src_o,r_e,DaggerNo);
 	}
 	FGrid->Barrier();
 	double t1=usecond();
 	// Fixed number of timed calls; the Broadcast presumably makes every rank use rank 0's value.
 	uint64_t ncall = 500;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	Ds.ZeroCounters();
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	// Each call is timed individually so that min/max/mean/err can be derived afterwards.
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  Ds.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	FGrid->Barrier();
 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	// 1146 flops per site on half of the global volume
 	// (presumably one checkerboard per DhopEO call -- TODO confirm).
 	double flops=(1146.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	// Rates are flops per usecond() unit, i.e. Mflop/s assuming usecond() counts microseconds.
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	// First case seeds best/worst (sentinel 0); later cases widen the range.
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
      }
      // Summary: best and worst rates, then one per-node figure per case under the fmt header.
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    return mflops_best;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -696,62 +598,50 @@ int main (int argc, char ** argv)
  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;
  if ( do_su3 ) {
    // empty for now
  }
 #if 1
  int sel=2;
-  Coordinate L_list({8,12,16,24});
+  std::vector<int> L_list({16,24,32});
 #else
  int sel=1;
  Coordinate L_list({8,12});
 #endif
  int selm1=sel-1;
  std::vector<double> robust_list;
  std::vector<double> wilson;
  std::vector<double> dwf4;
-  std::vector<double> dwf5;
+  std::vector<double> staggered;
  if ( do_wilson ) {
  int Ls=1;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
-      double robust;
+    wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
      wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
    }
  }
-  int Ls=16;
+  Ls=12;
  if ( do_dwf ) {
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
-      double robust;
+    double result = Benchmark::DWF(Ls,L_list[l]) ;
      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
    dwf4.push_back(result);
      robust_list.push_back(robust);
    }
  }
-  if ( do_dwf ) {
+  /*
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  for(int l=0;l<L_list.size();l++){
    double result = Benchmark::Staggered(L_list[l]) ;
    staggered.push_back(result);
  }
  */
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }
  int NN=NN_global;
  if ( do_memory ) {
@@ -768,7 +658,6 @@ int main (int argc, char ** argv)
    Benchmark::Comms();
  }
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -782,10 +671,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
    std::cout<<std::setprecision(3);
  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_schur.cc
+++ b/benchmarks/Benchmark_schur.cc
@@ -0,0 +1,176 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_schur.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
  // Gamma-matrix identifiers for the four space-time directions, indexed 0..3 as X, Y, Z, T.
  // NOTE(review): not referenced by main or benchDw in this file -- confirm whether it is still needed.
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };
 // Runs the Schur-preconditioned benchmark for one volume and prints one table row.
 // The first argument holds the four lattice dimensions (the definition names it latt4);
 // Ls is the extent of the fifth dimension.
 void benchDw(std::vector<int> & L, int Ls);
 // Entry point: prints the kernel/thread configuration, then calls benchDw once
 // per four-dimensional volume in the table below, all at the same Ls.
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);

   // Extent of the fifth dimension used for every volume in the scan.
   const int Ls=12;

   // Volumes to scan; flip the #if to run the single large volume instead.
 #if 1
   const std::vector< std::vector<int> > latts = {
     {24,24,24,24},
     {48,24,24,24},
     {96,24,24,24},
     {96,48,24,24}
     // further candidates: {96,48,48,24}, {96,48,48,48}
   };
 #else
   const std::vector< std::vector<int> > latts = {
     // alternative: {96,48,48,48}
     {96,96,96,192}
   };
 #endif

   // Separator lines reused by the banners below.
   const std::string stars("*****************************************************************");
   const std::string rule ("==========================================================================");

   std::cout << GridLogMessage << stars << std::endl;
   std::cout << GridLogMessage << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" << std::endl;
   std::cout << GridLogMessage << stars << std::endl;

   // Report which Wilson kernel variant is currently selected.
   const auto kernel = WilsonKernelsStatic::Opt;
   if ( kernel == WilsonKernelsStatic::OptGeneric ) {
     std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
   }
   if ( kernel == WilsonKernelsStatic::OptHandUnroll ) {
     std::cout << GridLogMessage << "* Using Nc=3       WilsonKernels" << std::endl;
   }
   if ( kernel == WilsonKernelsStatic::OptInlineAsm ) {
     std::cout << GridLogMessage << "* Using Asm Nc=3   WilsonKernels" << std::endl;
   }
   std::cout << GridLogMessage << stars << std::endl;

   std::cout << GridLogMessage << "Grid is setup to use " << GridThread::GetThreads() << " threads" << std::endl;

   // Table header.
   std::cout << GridLogMessage << rule << std::endl;
   std::cout << GridLogMessage << "= Benchmarking DWF" << std::endl;
   std::cout << GridLogMessage << rule << std::endl;
   std::cout << GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne " << std::endl;
   std::cout << GridLogMessage << rule << std::endl;

   // One row per volume: "N0xN1xN2xN3xLs", then whatever benchDw appends.
   // Each entry is copied because benchDw takes a non-const reference.
   for (std::vector<int> dims : latts) {
     std::cout << GridLogMessage << "\t";
     for (int d = 0; d < Nd; ++d) {
       std::cout << dims[d] << "x";
     }
     std::cout << Ls << "\t";
     benchDw(dims, Ls);
   }

   std::cout << GridLogMessage << rule << std::endl;
   Grid_finalize();
 }
 // Times the red-black preconditioned Mobius operator (SchurDiagOneOperator::Mpc)
 // and a Chebyshev polynomial of it on a latt4 x Ls volume, appending one row to
 // the table started by the caller.
 //
 // latt4 : the four space-time lattice dimensions (global volume).
 // Ls    : extent of the fifth dimension.
 //
 // Columns written to std::cout (tab separated):
 //   ranks | rate, Grid flop count | rate, QUDA flop count | seconds   (ncall Mpc applications)
 //         | rate, Grid flop count | rate, QUDA flop count | seconds   (one Chebyshev application)
 // Rates are flops per usecond() unit, i.e. Mflop/s assuming usecond() counts microseconds.
 //
 // NOTE(review): the four grids allocated below are never deleted -- confirm whether that is intentional.
 void benchDw(std::vector<int> & latt4, int Ls)
 {
   /////////////////////////////////////////////////////////////////////////////////////
   // for Nc=3
   /////////////////////////////////////////////////////////////////////////////////////
   // Dw :  Ls*24*(7+48)= Ls*1320 
   //
   // M5D:  Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
   // Meo:  Ls*24*(7+48) + Ls*72 = Ls*1392 
   //
   // Mee:  3*Ns*2*Nc*Ls  // Chroma 6*N5*Nc*Ns 
   //
   // LeemInv : 2*2*Nc*madd*Ls
   // LeeInv  : 2*2*Nc*madd*Ls
   // DeeInv  : 4*2*Nc*mul *Ls
   // UeeInv  : 2*2*Nc*madd*Ls
   // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
   // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
   // Mpc => 1452*cbvol*2*Ls flops // 
   //     => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
   /////////////////////////////////////////////////////////////////////////////////////
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

   // Flops per 4d site for one Mpc application, counted two ways (see table above):
   // both share the hopping-term and final-term counts and differ only in the middle term.
   const long unsigned int single_site_mpc_flops  = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;
   const long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls;

   const int ncall=300;   // timed Mpc applications
   int order =151;        // Chebyshev polynomial order

   RealD mass=0.1;
   RealD M5  =1.8;
   RealD NP = UGrid->_Nprocessors;

   // Number of 4d sites in the volume.
   double volume=1;
   for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];

   // Zero gauge field and constant source: only the operator cost is measured here.
   LatticeGaugeFieldF Umu(UGrid); Umu=Zero();
   MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);

   LatticeFermionF src_o (FrbGrid); src_o=1.0;
   LatticeFermionF r_o   (FrbGrid); r_o=Zero();

   SchurDiagOneOperator<MobiusFermionF,LatticeFermionF>  Mpc(Dw);
   Chebyshev<LatticeFermionF>      Cheby(0.0,60.0,order);

   // ---- Mpc: three untimed warm-up applications, then ncall timed ones ----
   for(int w=0;w<3;w++){
     Mpc.Mpc(src_o,r_o);
   }

   double t0=usecond();
   for(int i=0;i<ncall;i++){
     Mpc.Mpc(src_o,r_o);
   }
   double t1=usecond();

   double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo  so CB cancels.
   std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0);
   flops=(single_site_quda_flops*volume*ncall);
   std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";

   // ---- Chebyshev: one untimed warm-up application, then one timed ----
   // Cheby uses MpcDagMpc so 2x flops
   Cheby(Mpc,src_o,r_o);
   t0=usecond();
   Cheby(Mpc,src_o,r_o);
   t1=usecond();

   flops=(single_site_mpc_flops*volume*2*order);
   std::cout <<"\t"<<flops/(t1-t0);
   flops=(single_site_quda_flops*volume*2*order);
   std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
   std::cout <<std::endl;
   //  Dw.Report();
 }
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -88,25 +88,6 @@ int main (int argc, char ** argv)
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  ref = Zero();
  /*  
  { // Naive wilson implementation
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
 	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
      }
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
 	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
      }
    }
  }
  ref = -0.5*ref;
  */
  RealD mass=0.1;
  RealD c1=9.0/8.0;
--- a/configure.ac
+++ b/configure.ac
@@ -274,12 +274,20 @@ case ${ac_gen_scalar} in
 esac
 ##################### Compiler dependent choices
-case ${CXX} in 
+
 #Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison
 CXXBASE=${CXX}
 CXXTEST=${CXX}
 if echo "${CXX}" | grep -q "nvcc"; then
  CXXTEST="nvcc"
 fi   
 case ${CXXTEST} in 
  nvcc) 
 #    CXX="nvcc -keep -v -x cu "
 #    CXXLD="nvcc -v -link"
-    CXX="nvcc -x cu "
+    CXX="${CXXBASE} -x cu "
-    CXXLD="nvcc -link"
+    CXXLD="${CXXBASE} -link"
 #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
    if test $ac_openmp = yes; then