Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2026-05-26 03:54:17 +01:00 · 2021-10-26 10:45:46 +01:00
parent c144b32368 ba7e371b90
commit fd13a3f2be
31 changed files with 1083 additions and 236 deletions
@@ -52,6 +52,7 @@ public:
  virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
  virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
  virtual void HermOp(const Field &in, Field &out)=0;
  virtual ~LinearOperatorBase(){};
 };
@@ -48,6 +48,7 @@ public:
  virtual  void Mdiag    (const Field &in, Field &out)=0;
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0;
  virtual ~SparseMatrixBase() {};
 };
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
  virtual  void MeooeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeDag    (const Field &in, Field &out)=0;
  virtual  void MooeeInvDag (const Field &in, Field &out)=0;
-
+  virtual ~CheckerBoardedSparseMatrixBase() {};
 };
 NAMESPACE_END(Grid);
@@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
    typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
    typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
-  public:                                                
+  public:
    using LinearFunction<FieldD>::operator();
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
@@ -33,16 +33,19 @@ namespace Grid {
 // Guesser that discards the source and resets the guess to Zero().
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
  // Re-expose the operator() declared in LinearFunction<Field>; the
  // operator() defined below would otherwise hide it.
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    guess = Zero();
  }
 };
 // Guesser that leaves the guess exactly as the caller supplied it.
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
  // Re-expose the operator() declared in LinearFunction<Field>; the
  // operator() defined below would otherwise hide it.
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    // Intentionally empty: neither src nor guess is touched.
  }
 };
 // Guesser that uses the source itself as the guess.
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
  // Re-expose the operator() declared in LinearFunction<Field>; the
  // operator() defined below would otherwise hide it.
  using LinearFunction<Field>::operator();

  virtual void operator()(const Field &src, Field &guess)
  {
    guess = src;
  }
 };
@@ -57,6 +60,7 @@ private:
  const unsigned int       N;
 public:
  using LinearFunction<Field>::operator();
  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
  : DeflatedGuesser(_evec, _eval, _evec.size())
@@ -101,6 +105,7 @@ private:
  const std::vector<RealD>       &eval_coarse;
 public:
  using LinearFunction<FineField>::operator();
  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
 				const std::vector<CoarseField> &_evec_coarse,
 				const std::vector<RealD>       &_eval_coarse)
@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
 #undef GRID_MM_VERBOSE 
 // Running byte totals, one per memory kind. They are adjusted by the
 // MemoryManager allocate/free routines and reported by PrintBytes().
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;
 // Report the memory manager's byte counters on std::cout.
 // The three running totals (total_shared, total_device, total_host) and the
 // cache occupancy for the Cpu, Acc and Shared cache types are printed after
 // a right shift by 20, i.e. in units of 2^20 bytes.
 // When GRID_CUDA is defined, cuda_mem() is called after the printout.
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
  uint64_t cacheBytes;
  cacheBytes = CacheBytes[Cpu];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Acc];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
  cacheBytes = CacheBytes[Shared];
  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
 #ifdef GRID_CUDA
  cuda_mem();
 #endif
 }
 //////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
-
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 // Hand out `bytes` of accelerator memory.
 // The request is first offered to the Acc allocation cache via Lookup();
 // only when that yields NULL is acceleratorAllocDevice() called.
 // Returns the pointer obtained from either path.
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
  // Account for the request exactly once, whether it is served from the
  // cache or freshly allocated, so that AcceleratorFree's single decrement
  // keeps total_device balanced.
  total_device+=bytes;
  void *ptr = (void *) Lookup(bytes,Acc);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocDevice(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Release `bytes` of accelerator memory at `ptr`.
 // The pointer is offered to the Acc allocation cache via Insert(); whatever
 // pointer Insert() hands back (non-NULL) is released with
 // acceleratorFreeDevice().
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
  // Subtract the released size exactly once per call; a second decrement on
  // the eviction path would unbalance the unsigned counter.
  total_device-=bytes;
  void *__freeme = Insert(ptr,bytes,Acc);
  if ( __freeme ) {
    acceleratorFreeDevice(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"AcceleratorFree "<<std::endl;
  PrintBytes();
 #endif
 }
 // Hand out `bytes` of shared memory.
 // The request is first offered to the Shared allocation cache via Lookup();
 // only when that yields NULL is acceleratorAllocShared() called.
 // Returns the pointer obtained from either path.
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
  // Account for the request exactly once, whether it is served from the
  // cache or freshly allocated, so that SharedFree's single decrement keeps
  // total_shared balanced.
  total_shared+=bytes;
  void *ptr = (void *) Lookup(bytes,Shared);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Release `bytes` of shared memory at `ptr`.
 // The pointer is offered to the Shared allocation cache via Insert();
 // whatever pointer Insert() hands back (non-NULL) is released with
 // acceleratorFreeShared().
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
  // Subtract the released size exactly once per call; a second decrement on
  // the eviction path would unbalance the unsigned counter.
  total_shared-=bytes;
  void *__freeme = Insert(ptr,bytes,Shared);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"SharedFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #ifdef GRID_UVM
 // GRID_UVM variant: hand out `bytes` of host-side memory.
 // The request is first offered to the Cpu allocation cache via Lookup();
 // only when that yields NULL is acceleratorAllocShared() called.
 // Returns the pointer obtained from either path.
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  // Account for the request exactly once, whether it is served from the
  // cache or freshly allocated, so that CpuFree's single decrement keeps
  // total_host balanced.
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocShared(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // GRID_UVM variant: release `bytes` of host-side memory at `_ptr`.
 // NotifyDeletion(_ptr) is called first, then the pointer is offered to the
 // Cpu allocation cache via Insert(); whatever pointer Insert() hands back
 // (non-NULL) is released with acceleratorFreeShared().
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  // Subtract the released size exactly once per call; a second decrement on
  // the eviction path would unbalance the unsigned counter.
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) {
    acceleratorFreeShared(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #else
 // Non-UVM variant: hand out `bytes` of host memory.
 // The request is first offered to the Cpu allocation cache via Lookup();
 // only when that yields NULL is acceleratorAllocCpu() called.
 // Returns the pointer obtained from either path.
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
  // Account for the request exactly once, whether it is served from the
  // cache or freshly allocated, so that CpuFree's single decrement keeps
  // total_host balanced.
  total_host+=bytes;
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
    ptr = (void *) acceleratorAllocCpu(bytes);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuAllocate "<<std::endl;
  PrintBytes();
 #endif
  return ptr;
 }
 // Non-UVM variant: release `bytes` of host memory at `_ptr`.
 // NotifyDeletion(_ptr) is called first, then the pointer is offered to the
 // Cpu allocation cache via Insert(); whatever pointer Insert() hands back
 // (non-NULL) is released with acceleratorFreeCpu().
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
  // Subtract the released size exactly once per call; a second decrement on
  // the eviction path would unbalance the unsigned counter.
  total_host-=bytes;
  NotifyDeletion(_ptr);
  void *__freeme = Insert(_ptr,bytes,Cpu);
  if ( __freeme ) {
    acceleratorFreeCpu(__freeme);
  }
 #ifdef GRID_MM_VERBOSE
  std::cout <<"CpuFree "<<std::endl;
  PrintBytes();
 #endif
 }
 #endif
@@ -115,7 +159,6 @@ void MemoryManager::Init(void)
  char * str;
  int Nc;
  int NcS;
  str= getenv("GRID_ALLOC_NCACHE_LARGE");
  if ( str ) {
@@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
  return ptr;
 #endif
 }
-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  if ( entries[v].valid ) {
    ret = entries[v].address;
    cacheBytes -= entries[v].bytes;
    entries[v].valid = 0;
    entries[v].address = NULL;
    entries[v].bytes = 0;
@@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
  entries[v].address=ptr;
  entries[v].bytes  =bytes;
  entries[v].valid  =1;
  cacheBytes += bytes;
  return ret;
 }
@@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
  return NULL;
 #endif
 }
-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
  assert(ncache>0);
 #ifdef GRID_OMP
@@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      cacheBytes -= entries[e].bytes;
      return entries[e].address;
    }
  }
@@ -82,14 +82,15 @@ private:
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];
  static uint64_t CacheBytes[NallocType];
  /////////////////////////////////////////////////
  // Free pool
  /////////////////////////////////////////////////
  static void *Insert(void *ptr,size_t bytes,int type) ;
  static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
  static void PrintBytes(void);
 public:
@@ -3,7 +3,7 @@
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
@@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
  PrintBytes();
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
  std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
  std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@@ -389,7 +389,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
    acceleratorCopySynchronise(); // MPI prob slower
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -405,6 +404,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
  if (nreq==0) return;
  std::vector<MPI_Status> status(nreq);
  acceleratorCopySynchronise(); 
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
  list.resize(0);
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    exit(EXIT_FAILURE);
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
  });
 }
 // Copy the sites of `full` whose parity equals `cb` into `half`, looping
 // over the outer sites of full's grid with accelerator_for.
 //   cb               - parity to extract; also stored in half.Checkerboard()
 //   half             - destination lattice (opened with an AcceleratorWrite view)
 //   full             - source lattice (opened with an AcceleratorRead view)
 //   checker_dim_half - dimension whose coordinate is divided by 2 when the
 //                      site index into `half` is formed (default 0)
 template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
 {
  half.Checkerboard() = cb;
  autoView(half_v, half, AcceleratorWrite);
  autoView(full_v, full, AcceleratorRead);
  // Grid geometry is copied into locals before the loop body uses it.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Decode the site index ss into a coordinate using full's _rdimensions.
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of this site: sum of the coordinates in the dimensions flagged
    // by half's _checker_dim_mask, taken modulo 2.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Site index in `half`: stride-weighted sum of the coordinates, with
      // the coordinate in checker_dim_half halved first.
      int ssh=0;
      for(int d=0;d<ndim_half;d++) {
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(half_v[ssh],full_v(ss));
    }
  });
 }
 // Write the sites of `half` back into the sites of `full` that have the
 // same parity as half.Checkerboard(), looping over the outer sites of
 // full's grid with accelerator_for. Sites of `full` with the other parity
 // are not written.
 //   full             - destination lattice (opened with an AcceleratorWrite view)
 //   half             - source lattice (opened with an AcceleratorRead view)
 //   checker_dim_half - dimension whose coordinate is divided by 2 when the
 //                      site index into `half` is formed (default 0)
 template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
 {
  int cb = half.Checkerboard();
  autoView(half_v , half, AcceleratorRead);
  autoView(full_v , full, AcceleratorWrite);
  // Grid geometry is copied into locals before the loop body uses it.
  Coordinate rdim_full             = full.Grid()->_rdimensions;
  Coordinate rdim_half             = half.Grid()->_rdimensions;
  unsigned long ndim_half          = half.Grid()->_ndimension;
  Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
  Coordinate ostride_half          = half.Grid()->_ostride;
  accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
    Coordinate coor;
    int cbos;
    int linear=0;
    // Decode the site index ss into a coordinate using full's _rdimensions.
    Lexicographic::CoorFromIndex(coor,ss,rdim_full);
    assert(coor.size()==ndim_half);
    // Parity of this site: sum of the coordinates in the dimensions flagged
    // by half's _checker_dim_mask, taken modulo 2.
    for(int d=0;d<ndim_half;d++){ 
      if(checker_dim_mask_half[d]) linear += coor[d];
    }
    cbos = (linear&0x1);
    if (cbos==cb) {
      // Site index in `half`: stride-weighted sum of the coordinates, with
      // the coordinate in checker_dim_half halved first.
      int ssh=0;
      for(int d=0;d<ndim_half;d++){
        if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
        else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
      }
      coalescedWrite(full_v[ss],half_v(ssh));
    }
  });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
@@ -576,6 +576,7 @@ class ScidacReader : public GridLimeReader {
    std::string rec_name(ILDG_BINARY_DATA);
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
  skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 	return;
      }
@@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define REGISTER
 #ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane);		\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane);		\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane);		\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane);		\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane);		\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane);		\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane);		\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane);		\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane);		\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane);		\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+    Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane);	}
 #define PERMUTE_DIR(dir) ;
 #else
-#define LOAD_CHIMU(ptype)		\
+#define LOAD_CHIMU(Ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_32=ref()(3)(2);}
 #define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);	\
+  permute##dir(Chi_00,Chi_00);			\
-      permute##dir(Chi_01,Chi_01);\
+  permute##dir(Chi_01,Chi_01);			\
-      permute##dir(Chi_02,Chi_02);\
+  permute##dir(Chi_02,Chi_02);			\
-      permute##dir(Chi_10,Chi_10);	\
+  permute##dir(Chi_10,Chi_10);			\
-      permute##dir(Chi_11,Chi_11);\
+  permute##dir(Chi_11,Chi_11);			\
-      permute##dir(Chi_12,Chi_12);
+  permute##dir(Chi_12,Chi_12);
 #endif
@@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_32-= UChi_12;
 #define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  {int ptype;					\
-  offset = SE->_offset;				\
+   SE=st.GetEntry(ptype,DIR,ss);		\
-  local  = SE->_is_local;			\
+   auto offset = SE->_offset;			\
-  perm   = SE->_permute;			\
+   auto local  = SE->_is_local;			\
-  if ( local ) {				\
+   auto perm   = SE->_permute;			\
-    LOAD_CHIMU(PERM);				\
+   if ( local ) {				\
-    PROJ;					\
+     LOAD_CHIMU(PERM);				\
-    if ( perm) {				\
+     PROJ;					\
-      PERMUTE_DIR(PERM);			\
+     if ( perm) {				\
-    }						\
+       PERMUTE_DIR(PERM);			\
-  } else {					\
+     }						\
-    LOAD_CHI;					\
+   } else {					\
-  }						\
+     LOAD_CHI;					\
-  acceleratorSynchronise();			\
+   }						\
-  MULT_2SPIN(DIR);				\
+   acceleratorSynchronise();			\
-  RECON;					
+   MULT_2SPIN(DIR);				\
   RECON;					}
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)		\
-  SE=&st_p[DIR+8*ss];				\
+  { SE=&st_p[DIR+8*ss];						\
-  ptype=st_perm[DIR];				\
+  auto ptype=st_perm[DIR];					\
-  offset = SE->_offset;				\
+  auto offset = SE->_offset;					\
-  local  = SE->_is_local;			\
+  auto local  = SE->_is_local;					\
-  perm   = SE->_permute;			\
+  auto perm   = SE->_permute;					\
-  if ( local ) {				\
+  if ( local ) {						\
-    LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);						\
-    PROJ;					\
+    PROJ;							\
-    if ( perm) {				\
+    if ( perm) {						\
-      PERMUTE_DIR(PERM);			\
+      PERMUTE_DIR(PERM);					\
-    }						\
+    }								\
-  } else {					\
+  } else {							\
-    LOAD_CHI;					\
+    LOAD_CHI;							\
-  }						\
+  }								\
-  acceleratorSynchronise();			\
+  acceleratorSynchronise();					\
-  MULT_2SPIN(DIR);				\
+  MULT_2SPIN(DIR);						\
-  RECON;					
+  RECON;					}
 #define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
-  SE=&st_p[DIR+8*ss];							\
+  { SE=&st_p[DIR+8*ss];							\
-  ptype=st_perm[DIR];							\
+    auto ptype=st_perm[DIR];						\
- /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+    /*SE=st.GetEntry(ptype,DIR,ss);*/					\
-  offset = SE->_offset;				\
+    auto offset = SE->_offset;						\
-  perm   = SE->_permute;			\
+    auto perm   = SE->_permute;						\
-  LOAD_CHIMU(PERM);				\
+    LOAD_CHIMU(PERM);							\
-  PROJ;						\
+    PROJ;								\
-  MULT_2SPIN(DIR);				\
+    MULT_2SPIN(DIR);							\
-  RECON;					
+    RECON;					}
 #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  local  = SE->_is_local;			\
+  auto offset = SE->_offset;					\
-  perm   = SE->_permute;			\
+  auto local  = SE->_is_local;					\
-  if ( local ) {				\
+  auto perm   = SE->_permute;					\
-    LOAD_CHIMU(PERM);				\
+  if ( local ) {						\
-    PROJ;					\
+    LOAD_CHIMU(PERM);						\
-    if ( perm) {				\
+    PROJ;							\
-      PERMUTE_DIR(PERM);			\
+    if ( perm) {						\
-    }						\
+      PERMUTE_DIR(PERM);					\
-  } else if ( st.same_node[DIR] ) {		\
+    }								\
-    LOAD_CHI;					\
+  } else if ( st.same_node[DIR] ) {				\
-  }						\
+    LOAD_CHI;							\
-  acceleratorSynchronise();			\
+  }								\
-  if (local || st.same_node[DIR] ) {		\
+  acceleratorSynchronise();					\
-    MULT_2SPIN(DIR);				\
+  if (local || st.same_node[DIR] ) {				\
-    RECON;					\
+    MULT_2SPIN(DIR);						\
-  }						\
+    RECON;							\
-  acceleratorSynchronise();			
+  }								\
  acceleratorSynchronise();			}
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
+  { int ptype;						\
-  offset = SE->_offset;				\
+  SE=st.GetEntry(ptype,DIR,ss);				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
+  auto offset = SE->_offset;				\
-    LOAD_CHI;					\
+  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {		\
-    MULT_2SPIN(DIR);				\
+    LOAD_CHI;						\
-    RECON;					\
+    MULT_2SPIN(DIR);					\
-    nmu++;					\
+    RECON;						\
-  }						\
+    nmu++;						\
-  acceleratorSynchronise();			
+  }							\
  acceleratorSynchronise();			}
-#define HAND_RESULT(ss)				\
+#define HAND_RESULT(ss)					\
-  {						\
+  {							\
-    SiteSpinor & ref (out[ss]);			\
+    SiteSpinor & ref (out[ss]);				\
    coalescedWrite(ref()(0)(0),result_00,lane);		\
    coalescedWrite(ref()(0)(1),result_01,lane);		\
    coalescedWrite(ref()(0)(2),result_02,lane);		\
@@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -640,8 +638,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
  int offset,local,perm, ptype;
  StencilEntry *SE;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
  int offset,local,perm, ptype;
  ZERO_RESULT;
  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -699,8 +695,8 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
@@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
  HAND_DECLARATIONS(Simt);
-  int offset, ptype;
+  //  int offset, ptype;
  StencilEntry *SE;
  int nmu=0;
  ZERO_RESULT;
@@ -730,8 +726,8 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
-  auto st_p = st._entries_p;						
+  //  auto st_p = st._entries_p;						
-  auto st_perm = st._permute_type;					
+  //  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
  HAND_DECLARATIONS(Simt);
  StencilEntry *SE;
-  int offset, ptype;
+  //  int offset, ptype;
  int nmu=0;
  ZERO_RESULT;
  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@@ -78,6 +78,8 @@ public:
  typedef Lattice<SiteLink>    LinkField; 
  typedef Lattice<SiteField>   Field;
  typedef SU<Nrepresentation> Group;
  // Guido: we can probably separate the types from the HMC functions
  // this will create 2 kinds of implementations
  // probably confusing the users
@@ -118,7 +120,7 @@ public:
    LinkField Pmu(P.Grid());
    Pmu = Zero();
    for (int mu = 0; mu < Nd; mu++) {
-      SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
      Pmu = Pmu*scale;
      PokeIndex<LorentzIndex>(P, Pmu, mu);
@@ -159,15 +161,15 @@ public:
  }
  static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::HotConfiguration(pRNG, U);
+    Group::HotConfiguration(pRNG, U);
  }
  static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::TepidConfiguration(pRNG, U);
+    Group::TepidConfiguration(pRNG, U);
  }
  static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-    SU<Nc>::ColdConfiguration(pRNG, U);
+    Group::ColdConfiguration(pRNG, U);
  }
 };
@@ -1,61 +1,63 @@
-Using HMC in Grid version 0.5.1
+# Using HMC in Grid
-These are the instructions to use the Generalised HMC on Grid version 0.5.1.
+These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
-Disclaimer: GRID is still under active development so any information here can be changed in future releases.
+Disclaimer: Grid is still under active development so any information here can be changed in future releases.
-Command line options
+## Command line options
-===================
+
-(relevant file GenericHMCrunner.h)
+(relevant file `GenericHMCrunner.h`)
 The initial configuration can be changed at the command line using 
--StartType <your choice>
+`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
-valid choices, one among these
+`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
-HotStart, ColdStart, TepidStart, CheckpointStart
+Default: `--StartingType HotStart`
 default: HotStart
-example
+Example:
-./My_hmc_exec  --StartType HotStart
+```
 ./My_hmc_exec  --StartingType HotStart
 ```
-The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
+The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
+`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
-default: 0
+Default: `--StartingTrajectory 0`
 The number of trajectories for a specific run is specified at the command line by
--Trajectories <integer>
+`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
-default: 1
+Default: `--Trajectories 1`
 The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
+`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
-default: 10
+Default: `--Thermalizations 10`
 Any other parameter is defined in the source for the executable.
-HMC controls
+## HMC controls
 ===========
 The lines 
 ```
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
  std::vector<int> ParSeed({6, 7, 8, 9, 10});
 ```
 define the seeds for the serial and the parallel RNG.
 The line 
 ```
  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
 ```
 declares the number of molecular dynamics steps and the total trajectory length.
-Actions
+## Actions
 ======
-Action names are defined in the file
+Action names are defined in the directory `Grid/qcd/action`.
 lib/qcd/Actions.h
-Gauge actions list:
+Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):
 ```
 WilsonGaugeActionR;
 WilsonGaugeActionF;
 WilsonGaugeActionD;
@@ -68,8 +70,9 @@ IwasakiGaugeActionD;
 SymanzikGaugeActionR;
 SymanzikGaugeActionF;
 SymanzikGaugeActionD;
 ```
-
+```
 ConjugateWilsonGaugeActionR;
 ConjugateWilsonGaugeActionF;
 ConjugateWilsonGaugeActionD;
@@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
 ConjugateSymanzikGaugeActionR;
 ConjugateSymanzikGaugeActionF;
 ConjugateSymanzikGaugeActionD;
 ```
 Each of these actions accepts a single parameter at creation time (beta).
 Example for creating a Symanzik action with beta=4.0
 ```
  SymanzikGaugeActionR(4.0)
 ```
 Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
 ```
 ScalarActionR;
 ScalarActionF;
 ScalarActionD;
 ```
-
+The suffixes `R`, `F`, `D` in the action names refer to the `Real`
-each of these action accept one single parameter at creation time (beta).
+(the precision is defined at compile time by the `--enable-precision` flag in the configure),
-Example for creating a Symanzik action with beta=4.0
+`Float` and `Double`, which force the precision of the action to be 32 and 64 bit respectively.
 	SymanzikGaugeActionR(4.0)
 The suffixes R,F,D in the action names refer to the Real
 (the precision is defined at compile time by the --enable-precision flag in the configure),
 Float and Double, that force the precision of the action to be 32, 64 bit respectively.
@@ -322,8 +322,8 @@ public:
    int simd_layout     = _grid->_simd_layout[dimension];
    int comm_dim        = _grid->_processors[dimension] >1 ;
-    int recv_from_rank;
+    //    int recv_from_rank;
-    int xmit_to_rank;
+    //    int xmit_to_rank;
    if ( ! comm_dim ) return 1;
    if ( displacement == 0 ) return 1;
@@ -74,11 +74,13 @@ void acceleratorInit(void)
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
    }
  }
  MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 #ifdef GRID_DEFAULT_GPU
  int device = 0;
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  if ( world_rank == 0 ) {
    printf("AcceleratorCudaInit: using default device \n");
@@ -87,10 +89,20 @@ void acceleratorInit(void)
    printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
  }
 #else
  int device = rank;
  printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
  printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
  cudaSetDevice(rank);
 #endif
  cudaSetDevice(device);
  cudaStreamCreate(&copyStream);
  const int len=64;
  char busid[len];
  if( rank == world_rank ) { 
    cudaDeviceGetPCIBusId(busid, len, device);
    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
  }
  if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
@@ -95,6 +95,7 @@ void     acceleratorInit(void);
 //////////////////////////////////////////////
 #ifdef GRID_CUDA
 #include <cuda.h>
 #ifdef __CUDA_ARCH__
@@ -115,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific
 // Print the device memory usage reported by cudaMemGetInfo to std::cout.
 // The "used" figure is derived as total minus free.
 inline void cuda_mem(void)
 {
  // Zero-initialise the counters: the status returned by cudaMemGetInfo is not
  // inspected here, so if the query fails without writing its outputs we print
  // zeros instead of reading indeterminate values.
  size_t free_t=0,total_t=0,used_t=0;
  cudaMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
 }
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
@@ -137,7 +137,7 @@ int main (int argc, char ** argv)
  Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
  Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
-  double          n = BENCH_IO_NPASS;
+  //  double          n = BENCH_IO_NPASS;
  stats(mean, stdDev, perf);
  stats(avMean, avStdDev, avPerf);
@@ -164,7 +164,7 @@ int main (int argc, char ** argv)
                mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
  }
  MSG << std::endl;
-  MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%4s %12s %12s %12s %12s\n",
              "L", "std read", "std write", "Grid read", "Grid write");
@@ -185,7 +185,7 @@ int main (int argc, char ** argv)
              avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
              avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
  MSG << std::endl;
-  MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
+  MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
  MSG << std::endl;
  grid_printf("%12s %12s %12s %12s\n",
              "std read", "std write", "Grid read", "Grid write");
@@ -142,7 +142,7 @@ public:
 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
-	int ncomm;
+	//	int ncomm;
 	double dbytes;
        for(int dir=0;dir<8;dir++) {
@@ -290,7 +290,7 @@ public:
      LatticeSU4 z(&Grid); z=Zero();
      LatticeSU4 x(&Grid); x=Zero();
      LatticeSU4 y(&Grid); y=Zero();
-      double a=2.0;
+      //      double a=2.0;
      uint64_t Nloop=NLOOP;
@@ -72,7 +72,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
  std::vector<double> t_time(Nloop);
-  time_statistics timestat;
+  //  time_statistics timestat;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
@@ -184,8 +184,10 @@ int main (int argc, char ** argv)
      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
+      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
-
+	       <<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
 	       <<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
      assert(nn==nn);
  }    
  Grid_finalize();
@@ -9,6 +9,7 @@ using namespace std;
 using namespace Grid;
 typedef SpinColourMatrix Propagator;
 typedef SpinColourVector Fermion;
 typedef PeriodicGimplR   GimplR;
 template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
 {
@@ -55,6 +56,16 @@ void MakePhase(Coordinate mom,LatticeComplex &phase)
  }
  phase = exp(phase*ci);
 }
 void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
 {
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp);
    Utmp = Usmr;
  }
 }
 void PointSource(Coordinate &coor,LatticePropagator &source)
 {
  //  Coordinate coor({0,0,0,0});
@@ -97,23 +108,23 @@ void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
 {
  Real alpha=0.05;
-  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
+  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
  std::cout << " Initial plaquette "<<plaq << std::endl;
  LatticeColourMatrix   xform(U.Grid()); 
  Ufix = U;
  int orthog=Nd-1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog);
+  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
-  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Ufix);
+  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
  std::cout << " Final plaquette "<<plaq << std::endl;
 }
 template<class Field>
 void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
 {
-  typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
+  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);
  Integer Iterations = 40;
@@ -167,19 +178,21 @@ void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();
-  LatticeFermion src4  (UGrid); 
+  LatticeFermion src4  (UGrid); src4 = Zero();
  LatticeFermion src5  (FGrid); 
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
+  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
-
+      std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
      D.ImportPhysicalFermionSource(src4,src5);
      std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;
      result5=Zero();
      schur(D,src5,result5,ZG);
@@ -287,15 +300,10 @@ int main (int argc, char ** argv)
 								   GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  std::vector<int> seeds4({1,2,3,4}); 
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
-  LatticeGaugeField Ufixed(UGrid);
+  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
@@ -308,13 +316,20 @@ int main (int argc, char ** argv)
  {
    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
-    //    SU<Nc>::HotConfiguration(RNG4,Umu);
+    config="ColdConfig";
    config="HotConfig";
  }
-  GaugeFix(Umu,Ufixed);
+  //  GaugeFix(Umu,Utmp);
-  Umu=Ufixed;
+  //  Umu=Utmp;
  int nsmr=3;
  RealD rho=0.1;
  RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
  LinkSmear(nsmr,rho,Umu,Usmr);
  RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
  std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
  std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;
  std::vector<int>   smeared_link({ 0,0,1} ); 
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} ); 
  std::vector<RealD> bs   ({ 1.0,1.0,1.5} );  // DDM
@@ -330,6 +345,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);
  for(int m=0;m<masses.size();m++) {
@@ -339,30 +357,40 @@ int main (int argc, char ** argv)
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];
    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;
    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
-    FermActs.push_back(new MobiusFermionR(Umu,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c));
+    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }
  LatticePropagator point_source(UGrid);
  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);
-  Coordinate Origin({0,0,0,0});
+  int tslice = 0;
-  PointSource   (Origin,point_source);
+  //////////////////////////////////////////////////////////////////////
-  Z2WallSource  (RNG4,0,z2wall_source);
+  // RNG seeded for Z2 wall
-  GFWallSource  (0,gfwall_source);
+  //////////////////////////////////////////////////////////////////////
-  
+  // You can manage seeds however you like.
-  std::vector<LatticePropagator> PointProps(nmass,UGrid);
+  // Recommend SeedUniqueString.
-  std::vector<LatticePropagator> GaussProps(nmass,UGrid);
+  //////////////////////////////////////////////////////////////////////
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);
  std::vector<LatticePropagator> Z2Props   (nmass,UGrid);
  std::vector<LatticePropagator> GFProps   (nmass,UGrid);
  for(int m=0;m<nmass;m++) {
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
    Solve(*FermActs[m],z2wall_source    ,Z2Props[m]);
    std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
    Solve(*FermActs[m],gfwall_source    ,GFProps[m]);
    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;
  }
@@ -383,14 +411,15 @@ int main (int argc, char ** argv)
    std::stringstream wssg,wssz;
    /// Point sinks
-    ssg<<config<< "_m" << m1 << "_m"<< m2 << "p_gf_meson.xml";
+    ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
-    ssz<<config<< "_m" << m1 << "_m"<< m2 << "p_z2_meson.xml";
+    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
    MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);
    /// Wall sinks
-    wssg<<config<< "_m" << m1 << "_m"<< m2 << "w_gf_meson.xml";
+    wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
-    wssz<<config<< "_m" << m1 << "_m"<< m2 << "w_z2_meson.xml";
+    wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";
    WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
    WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
@@ -0,0 +1,179 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200060000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 1073741824 byte stencil comms buffers 
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 Grid : Message : MemoryManager Cache 13529146982 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 2.137929 s : Grid is setup to use 6 threads
 Grid : Message : 2.137941 s : Number of iterations to average: 250
 Grid : Message : 2.137950 s : ====================================================================================================
 Grid : Message : 2.137958 s : = Benchmarking sequential halo exchange from host memory 
 Grid : Message : 2.137966 s : ====================================================================================================
 Grid : Message : 2.137974 s :  L  	 Ls  	    bytes		MB/s uni	MB/s bidi
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.604949 s :    8	8	     393216       89973.9  		179947.8
 Grid : Message : 2.668249 s :    8	8	     393216       18650.3  		37300.5
 Grid : Message : 2.732288 s :    8	8	     393216       18428.5  		36857.1
 Grid : Message : 2.753565 s :    8	8	     393216       55497.2  		110994.4
 Grid : Message : 2.808960 s :   12	8	    1327104       100181.5  		200363.0
 Grid : Message : 3.226900 s :   12	8	    1327104       20600.5  		41201.0
 Grid : Message : 3.167459 s :   12	8	    1327104       24104.6  		48209.2
 Grid : Message : 3.227660 s :   12	8	    1327104       66156.7  		132313.5
 Grid : Message : 3.413570 s :   16	8	    3145728       56174.4  		112348.8
 Grid : Message : 3.802697 s :   16	8	    3145728       24255.9  		48511.7
 Grid : Message : 4.190498 s :   16	8	    3145728       24336.7  		48673.4
 Grid : Message : 4.385171 s :   16	8	    3145728       48484.1  		96968.2
 Grid : Message : 4.805284 s :   20	8	    6144000       46380.5  		92761.1
 Grid : Message : 5.562975 s :   20	8	    6144000       24328.5  		48656.9
 Grid : Message : 6.322562 s :   20	8	    6144000       24266.7  		48533.4
 Grid : Message : 6.773598 s :   20	8	    6144000       40868.5  		81736.9
 Grid : Message : 7.600999 s :   24	8	   10616832       40198.3  		80396.6
 Grid : Message : 8.912917 s :   24	8	   10616832       24279.5  		48559.1
 Grid : Message : 10.220961 s :   24	8	   10616832       24350.2  		48700.4
 Grid : Message : 11.728250 s :   24	8	   10616832       37390.9  		74781.8
 Grid : Message : 12.497258 s :   28	8	   16859136       36792.2  		73584.5
 Grid : Message : 14.585387 s :   28	8	   16859136       24222.2  		48444.3
 Grid : Message : 16.664783 s :   28	8	   16859136       24323.4  		48646.8
 Grid : Message : 17.955238 s :   28	8	   16859136       39194.7  		78389.4
 Grid : Message : 20.136479 s :   32	8	   25165824       35718.3  		71436.5
 Grid : Message : 23.241958 s :   32	8	   25165824       24311.4  		48622.9
 Grid : Message : 26.344810 s :   32	8	   25165824       24331.9  		48663.7
 Grid : Message : 28.384420 s :   32	8	   25165824       37016.3  		74032.7
 Grid : Message : 28.388879 s : ====================================================================================================
 Grid : Message : 28.388894 s : = Benchmarking sequential halo exchange from GPU memory 
 Grid : Message : 28.388909 s : ====================================================================================================
 Grid : Message : 28.388924 s :  L  	 Ls  	    bytes		MB/s uni	MB/s bidi
 Grid : Message : 28.553993 s :    8	8	     393216       8272.4  		16544.7
 Grid : Message : 28.679592 s :    8	8	     393216       9395.4  		18790.8
 Grid : Message : 28.811112 s :    8	8	     393216       8971.0  		17942.0
 Grid : Message : 28.843770 s :    8	8	     393216       36145.6  		72291.2
 Grid : Message : 28.981754 s :   12	8	    1327104       49591.6  		99183.2
 Grid : Message : 29.299764 s :   12	8	    1327104       12520.8  		25041.7
 Grid : Message : 29.620288 s :   12	8	    1327104       12422.2  		24844.4
 Grid : Message : 29.657645 s :   12	8	    1327104       106637.5  		213275.1
 Grid : Message : 29.952933 s :   16	8	    3145728       43939.2  		87878.5
 Grid : Message : 30.585411 s :   16	8	    3145728       14922.1  		29844.2
 Grid : Message : 31.219781 s :   16	8	    3145728       14877.2  		29754.4
 Grid : Message : 31.285017 s :   16	8	    3145728       144724.3  		289448.7
 Grid : Message : 31.706443 s :   20	8	    6144000       54676.2  		109352.4
 Grid : Message : 32.739205 s :   20	8	    6144000       17848.0  		35696.1
 Grid : Message : 33.771852 s :   20	8	    6144000       17849.9  		35699.7
 Grid : Message : 33.871981 s :   20	8	    6144000       184141.4  		368282.8
 Grid : Message : 34.536808 s :   24	8	   10616832       55784.3  		111568.6
 Grid : Message : 36.275648 s :   24	8	   10616832       18317.6  		36635.3
 Grid : Message : 37.997181 s :   24	8	   10616832       18501.7  		37003.4
 Grid : Message : 38.140442 s :   24	8	   10616832       222383.9  		444767.9
 Grid : Message : 39.177222 s :   28	8	   16859136       56609.7  		113219.4
 Grid : Message : 41.874755 s :   28	8	   16859136       18749.9  		37499.8
 Grid : Message : 44.529381 s :   28	8	   16859136       19052.9  		38105.8
 Grid : Message : 44.742192 s :   28	8	   16859136       237717.1  		475434.2
 Grid : Message : 46.184000 s :   32	8	   25165824       57091.2  		114182.4
 Grid : Message : 50.734740 s :   32	8	   25165824       19411.0  		38821.9
 Grid : Message : 53.931228 s :   32	8	   25165824       19570.6  		39141.2
 Grid : Message : 54.238467 s :   32	8	   25165824       245765.6  		491531.2
 Grid : Message : 54.268664 s : ====================================================================================================
 Grid : Message : 54.268680 s : = All done; Bye Bye
 Grid : Message : 54.268691 s : ====================================================================================================
@@ -0,0 +1,14 @@
 ../../configure --enable-comms=mpi \
 	      --enable-simd=GPU \
 	      --enable-gen-simd-width=32 \
 	      --enable-unified=no \
 	       --enable-shm=nvlink \
 	       --disable-gparity \
 	       --enable-setdevice \
 	       --disable-fermion-reps \
 	       --enable-accelerator=cuda \
 	       --prefix /ccs/home/paboyle/prefix \
 	       CXX=nvcc \
 	       LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
 	       CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
@@ -0,0 +1,206 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers 
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 8388608000 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.731905 s : Grid Layout
 Grid : Message : 1.731915 s : 	Global lattice size  : 48 48 48 72 
 Grid : Message : 1.731928 s : 	OpenMP threads       : 6
 Grid : Message : 1.731938 s : 	MPI tasks            : 2 2 2 3 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.683494 s : Making s innermost grids
 Grid : Message : 2.780034 s : Initialising 4d RNG
 Grid : Message : 2.833099 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 2.833121 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 2.916841 s : Initialising 5d RNG
 Grid : Message : 3.762880 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 3.762902 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 5.264345 s : Initialised RNGs
 Grid : Message : 6.489904 s : Drawing gauge field
 Grid : Message : 6.729262 s : Random gauge initialised 
 Grid : Message : 7.781273 s : Setting up Cshift based reference 
 Grid : Message : 8.725313 s : *****************************************************************
 Grid : Message : 8.725332 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 8.725342 s : *****************************************************************
 Grid : Message : 8.725352 s : *****************************************************************
 Grid : Message : 8.725362 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 8.725372 s : * Vectorising space-time by 4
 Grid : Message : 8.725383 s : * VComplexF size is 32 B
 Grid : Message : 8.725395 s : * SINGLE precision 
 Grid : Message : 8.725405 s : * Using Overlapped Comms/Compute
 Grid : Message : 8.725415 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 8.725425 s : *****************************************************************
 Grid : Message : 9.465229 s : Called warmup
 Grid : Message : 58.646066 s : Called Dw 3000 times in 4.91764e+07 us
 Grid : Message : 58.646121 s : mflop/s =   1.02592e+07
 Grid : Message : 58.646134 s : mflop/s per rank =  427468
 Grid : Message : 58.646145 s : mflop/s per node =  2.56481e+06
 Grid : Message : 58.646156 s : RF  GiB/s (base 2) =   20846.5
 Grid : Message : 58.646166 s : mem GiB/s (base 2) =   13029.1
 Grid : Message : 58.648008 s : norm diff   1.04778e-13
 Grid : Message : 58.734885 s : #### Dhop calls report 
 Grid : Message : 58.734897 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 58.734909 s : WilsonFermion5D TotalTime   /Calls        : 8217.71 us
 Grid : Message : 58.734922 s : WilsonFermion5D CommTime    /Calls        : 7109.5 us
 Grid : Message : 58.734933 s : WilsonFermion5D FaceTime    /Calls        : 446.623 us
 Grid : Message : 58.734943 s : WilsonFermion5D ComputeTime1/Calls        : 18.0558 us
 Grid : Message : 58.734953 s : WilsonFermion5D ComputeTime2/Calls        : 731.097 us
 Grid : Message : 58.734979 s : Average mflops/s per call                : 4.8157e+09
 Grid : Message : 58.734989 s : Average mflops/s per call per rank       : 2.00654e+08
 Grid : Message : 58.734999 s : Average mflops/s per call per node       : 1.20393e+09
 Grid : Message : 58.735008 s : Average mflops/s per call (full)         : 1.04183e+07
 Grid : Message : 58.735017 s : Average mflops/s per call per rank (full): 434094
 Grid : Message : 58.735026 s : Average mflops/s per call per node (full): 2.60456e+06
 Grid : Message : 58.735035 s : WilsonFermion5D Stencil
 Grid : Message : 58.735043 s : WilsonFermion5D StencilEven
 Grid : Message : 58.735051 s : WilsonFermion5D StencilOdd
 Grid : Message : 58.735059 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 58.735067 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 58.735075 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 64.934380 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 64.934740 s : Called DwDag
 Grid : Message : 64.934870 s : norm dag result 12.0422
 Grid : Message : 64.120756 s : norm dag ref    12.0422
 Grid : Message : 64.149389 s : norm dag diff   7.6644e-14
 Grid : Message : 64.317786 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 64.465331 s : src_e0.499995
 Grid : Message : 64.524653 s : src_o0.500005
 Grid : Message : 64.558706 s : *********************************************************
 Grid : Message : 64.558717 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 64.558727 s : * Vectorising space-time by 4
 Grid : Message : 64.558737 s : * SINGLE precision 
 Grid : Message : 64.558745 s : * Using Overlapped Comms/Compute
 Grid : Message : 64.558753 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 64.558761 s : *********************************************************
 Grid : Message : 92.702145 s : Deo mflop/s =   8.97692e+06
 Grid : Message : 92.702185 s : Deo mflop/s per rank   374038
 Grid : Message : 92.702198 s : Deo mflop/s per node   2.24423e+06
 Grid : Message : 92.702209 s : #### Dhop calls report 
 Grid : Message : 92.702223 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 92.702240 s : WilsonFermion5D TotalTime   /Calls        : 9377.88 us
 Grid : Message : 92.702257 s : WilsonFermion5D CommTime    /Calls        : 8221.84 us
 Grid : Message : 92.702277 s : WilsonFermion5D FaceTime    /Calls        : 543.548 us
 Grid : Message : 92.702301 s : WilsonFermion5D ComputeTime1/Calls        : 20.936 us
 Grid : Message : 92.702322 s : WilsonFermion5D ComputeTime2/Calls        : 732.33 us
 Grid : Message : 92.702376 s : Average mflops/s per call                : 4.13001e+09
 Grid : Message : 92.702387 s : Average mflops/s per call per rank       : 1.72084e+08
 Grid : Message : 92.702397 s : Average mflops/s per call per node       : 1.0325e+09
 Grid : Message : 92.702407 s : Average mflops/s per call (full)         : 9.12937e+06
 Grid : Message : 92.702416 s : Average mflops/s per call per rank (full): 380391
 Grid : Message : 92.702426 s : Average mflops/s per call per node (full): 2.28234e+06
 Grid : Message : 92.702435 s : WilsonFermion5D Stencil
 Grid : Message : 92.702443 s : WilsonFermion5D StencilEven
 Grid : Message : 92.702451 s : WilsonFermion5D StencilOdd
 Grid : Message : 92.702459 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 92.702467 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 92.702475 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 92.772983 s : r_e6.02121
 Grid : Message : 92.786384 s : r_o6.02102
 Grid : Message : 92.799622 s : res12.0422
 Grid : Message : 93.860500 s : norm diff   0
 Grid : Message : 93.162026 s : norm diff even  0
 Grid : Message : 93.197529 s : norm diff odd   0
@@ -0,0 +1,206 @@
 OPENMPI detected
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device Number    : 0
 AcceleratorCudaInit[0]: ========================
 AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
 AcceleratorCudaInit[0]:   totalGlobalMem: 16911433728 
 AcceleratorCudaInit[0]:   managedMemory: 1 
 AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
 AcceleratorCudaInit[0]:   warpSize: 32 
 AcceleratorCudaInit[0]:   pciBusID: 4 
 AcceleratorCudaInit[0]:   pciDeviceID: 0 
 AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 AcceleratorCudaInit: rank 0 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 0 device 0 bus id: 0004:04:00.0
 AcceleratorCudaInit: ================================================
 SharedMemoryMpi:  World communicator of size 24
 SharedMemoryMpi:  Node  communicator of size 6
 0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers 
 Setting up IPC
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
 __|_                                    _|__
 __|_   GGGG    RRRR    III    DDDD      _|__
 __|_  G        R   R    I     D   D     _|__
 __|_  G        R   R    I     D    D    _|__
 __|_  G  GG    RRRR     I     D    D    _|__
 __|_  G   G    R  R     I     D   D     _|__
 __|_   GGGG    R   R   III    DDDD      _|__
 __|_                                    _|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 __|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
 Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 AcceleratorCudaInit: rank 2 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 2 device 2 bus id: 0004:06:00.0
 AcceleratorCudaInit: rank 1 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 1 device 1 bus id: 0004:05:00.0
 AcceleratorCudaInit: rank 4 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 4 device 4 bus id: 0035:04:00.0
 AcceleratorCudaInit: rank 3 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 3 device 3 bus id: 0035:03:00.0
 AcceleratorCudaInit: rank 5 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 local rank 5 device 5 bus id: 0035:05:00.0
 GNU General Public License for more details.
 Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean
 Grid : Message : ================================================ 
 Grid : Message : MPI is initialised and logging filters activated 
 Grid : Message : ================================================ 
 Grid : Message : Requested 2147483648 byte stencil comms buffers 
 Grid : Message : MemoryManager Cache 8388608000 bytes 
 Grid : Message : MemoryManager::Init() setting up
 Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 Grid : Message : MemoryManager::Init() Using cudaMalloc
 Grid : Message : 1.544984 s : Grid Layout
 Grid : Message : 1.544992 s : 	Global lattice size  : 64 64 64 96 
 Grid : Message : 1.545003 s : 	OpenMP threads       : 6
 Grid : Message : 1.545011 s : 	MPI tasks            : 2 2 2 3 
 AcceleratorCudaInit: rank 8 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 6 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 11 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 16 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 17 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 13 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 12 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 21 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 23 setting device to node rank 5
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 22 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 19 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 18 setting device to node rank 0
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 7 setting device to node rank 1
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 10 setting device to node rank 4
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 9 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 14 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 15 setting device to node rank 3
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 AcceleratorCudaInit: rank 20 setting device to node rank 2
 AcceleratorCudaInit: Configure options --enable-setdevice=yes 
 Grid : Message : 2.994920 s : Making s innermost grids
 Grid : Message : 2.232502 s : Initialising 4d RNG
 Grid : Message : 2.397047 s : Intialising parallel RNG with unique string 'The 4D RNG'
 Grid : Message : 2.397069 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 Grid : Message : 2.653140 s : Initialising 5d RNG
 Grid : Message : 5.285347 s : Intialising parallel RNG with unique string 'The 5D RNG'
 Grid : Message : 5.285369 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 Grid : Message : 9.994738 s : Initialised RNGs
 Grid : Message : 13.153426 s : Drawing gauge field
 Grid : Message : 13.825697 s : Random gauge initialised 
 Grid : Message : 18.537657 s : Setting up Cshift based reference 
 Grid : Message : 22.296755 s : *****************************************************************
 Grid : Message : 22.296781 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 Grid : Message : 22.296791 s : *****************************************************************
 Grid : Message : 22.296800 s : *****************************************************************
 Grid : Message : 22.296809 s : * Benchmarking DomainWallFermionR::Dhop                  
 Grid : Message : 22.296818 s : * Vectorising space-time by 4
 Grid : Message : 22.296828 s : * VComplexF size is 32 B
 Grid : Message : 22.296838 s : * SINGLE precision 
 Grid : Message : 22.296847 s : * Using Overlapped Comms/Compute
 Grid : Message : 22.296855 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 22.296863 s : *****************************************************************
 Grid : Message : 24.746452 s : Called warmup
 Grid : Message : 137.525756 s : Called Dw 3000 times in 1.12779e+08 us
 Grid : Message : 137.525818 s : mflop/s =   1.41383e+07
 Grid : Message : 137.525831 s : mflop/s per rank =  589097
 Grid : Message : 137.525843 s : mflop/s per node =  3.53458e+06
 Grid : Message : 137.525854 s : RF  GiB/s (base 2) =   28728.7
 Grid : Message : 137.525864 s : mem GiB/s (base 2) =   17955.5
 Grid : Message : 137.693645 s : norm diff   1.04885e-13
 Grid : Message : 137.965585 s : #### Dhop calls report 
 Grid : Message : 137.965598 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 Grid : Message : 137.965612 s : WilsonFermion5D TotalTime   /Calls        : 18899.7 us
 Grid : Message : 137.965624 s : WilsonFermion5D CommTime    /Calls        : 16041.4 us
 Grid : Message : 137.965634 s : WilsonFermion5D FaceTime    /Calls        : 859.705 us
 Grid : Message : 137.965644 s : WilsonFermion5D ComputeTime1/Calls        : 70.5881 us
 Grid : Message : 137.965654 s : WilsonFermion5D ComputeTime2/Calls        : 2094.8 us
 Grid : Message : 137.965682 s : Average mflops/s per call                : 3.87638e+09
 Grid : Message : 137.965692 s : Average mflops/s per call per rank       : 1.61516e+08
 Grid : Message : 137.965702 s : Average mflops/s per call per node       : 9.69095e+08
 Grid : Message : 137.965712 s : Average mflops/s per call (full)         : 1.43168e+07
 Grid : Message : 137.965721 s : Average mflops/s per call per rank (full): 596533
 Grid : Message : 137.965730 s : Average mflops/s per call per node (full): 3.5792e+06
 Grid : Message : 137.965740 s : WilsonFermion5D Stencil
 Grid : Message : 137.965748 s : WilsonFermion5D StencilEven
 Grid : Message : 137.965756 s : WilsonFermion5D StencilOdd
 Grid : Message : 137.965764 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 137.965772 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 137.965780 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 156.554605 s : Compare to naive wilson implementation Dag to verify correctness
 Grid : Message : 156.554632 s : Called DwDag
 Grid : Message : 156.554642 s : norm dag result 12.0421
 Grid : Message : 156.639265 s : norm dag ref    12.0421
 Grid : Message : 156.888281 s : norm dag diff   7.62057e-14
 Grid : Message : 157.609797 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 Grid : Message : 158.208630 s : src_e0.499996
 Grid : Message : 158.162447 s : src_o0.500004
 Grid : Message : 158.267780 s : *********************************************************
 Grid : Message : 158.267791 s : * Benchmarking DomainWallFermionF::DhopEO                
 Grid : Message : 158.267801 s : * Vectorising space-time by 4
 Grid : Message : 158.267811 s : * SINGLE precision 
 Grid : Message : 158.267820 s : * Using Overlapped Comms/Compute
 Grid : Message : 158.267828 s : * Using GENERIC Nc WilsonKernels
 Grid : Message : 158.267836 s : *********************************************************
 Grid : Message : 216.487829 s : Deo mflop/s =   1.37283e+07
 Grid : Message : 216.487869 s : Deo mflop/s per rank   572011
 Grid : Message : 216.487881 s : Deo mflop/s per node   3.43206e+06
 Grid : Message : 216.487893 s : #### Dhop calls report 
 Grid : Message : 216.487903 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 Grid : Message : 216.487913 s : WilsonFermion5D TotalTime   /Calls        : 19399.6 us
 Grid : Message : 216.487923 s : WilsonFermion5D CommTime    /Calls        : 16475.4 us
 Grid : Message : 216.487933 s : WilsonFermion5D FaceTime    /Calls        : 972.393 us
 Grid : Message : 216.487943 s : WilsonFermion5D ComputeTime1/Calls        : 49.8474 us
 Grid : Message : 216.487953 s : WilsonFermion5D ComputeTime2/Calls        : 2089.93 us
 Grid : Message : 216.488001 s : Average mflops/s per call                : 5.39682e+09
 Grid : Message : 216.488011 s : Average mflops/s per call per rank       : 2.24867e+08
 Grid : Message : 216.488020 s : Average mflops/s per call per node       : 1.3492e+09
 Grid : Message : 216.488030 s : Average mflops/s per call (full)         : 1.39479e+07
 Grid : Message : 216.488039 s : Average mflops/s per call per rank (full): 581162
 Grid : Message : 216.488048 s : Average mflops/s per call per node (full): 3.48697e+06
 Grid : Message : 216.488057 s : WilsonFermion5D Stencil
 Grid : Message : 216.488065 s : WilsonFermion5D StencilEven
 Grid : Message : 216.488073 s : WilsonFermion5D StencilOdd
 Grid : Message : 216.488081 s : WilsonFermion5D Stencil     Reporti()
 Grid : Message : 216.488089 s : WilsonFermion5D StencilEven Reporti()
 Grid : Message : 216.488097 s : WilsonFermion5D StencilOdd  Reporti()
 Grid : Message : 217.384495 s : r_e6.02113
 Grid : Message : 217.426121 s : r_o6.02096
 Grid : Message : 217.472636 s : res12.0421
 Grid : Message : 218.200068 s : norm diff   0
 Grid : Message : 218.645673 s : norm diff even  0
 Grid : Message : 218.816561 s : norm diff odd   0
@@ -0,0 +1,25 @@
 #!/bin/bash
 #BSUB -P LGT104
 #BSUB -W 2:00
 #BSUB -nnodes 16
 #BSUB -J DWF
 # LSF batch job (16 nodes requested above): runs Benchmark_comms_host_device
 # once and Benchmark_dwf_fp32 twice via jsrun, redirecting the stdout of each
 # run to its own log file in the current directory.

 # Environment exported to the launched benchmarks: the OpenMP thread count,
 # plus two PAMI_* settings.
 # NOTE(review): the PAMI_* variables presumably tune adapter affinity and
 # striping in the MPI messaging layer - confirm against the MPI documentation.
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 # Extra flags appended (through $OPT) to the Benchmark_dwf_fp32 command lines
 # below; the comms benchmark command line does not use them.
 export OPT="--comms-concurrent --comms-overlap "
 # $APP is expanded unquoted on the jsrun lines on purpose, so the shell splits
 # it into the benchmark executable followed by its arguments.
 # NOTE(review): --nrs 16 -a6 presumably launches 16*6 = 96 tasks, whereas
 # --mpi 4.4.4.3 describes a 4*4*4*3 = 192 rank decomposition - confirm that
 # the node count and the decomposition are meant to match.
 APP="./benchmarks/Benchmark_comms_host_device  --mpi 4.4.4.3 "
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log
 # The ".24" in the log name appears to be the per-rank lattice extent
 # (96/4 = 72/3 = 24).
 # NOTE(review): --shm-force-mpi 1 is passed twice on this command line;
 # presumably redundant - confirm before removing.
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log
 # Same benchmark on a larger global lattice; ".32" appears to be the per-rank
 # extent (128/4 = 96/3 = 32).
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
 jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log
@@ -0,0 +1,25 @@
 #!/bin/bash
 #BSUB -P LGT104
 #BSUB -W 2:00
 #BSUB -nnodes 4
 #BSUB -J DWF
 # LSF batch job (4 nodes requested above): runs Benchmark_comms_host_device
 # once and Benchmark_dwf_fp32 twice via jsrun, redirecting the stdout of each
 # run to its own output file in the current directory.

 # Environment exported to the launched benchmarks: the OpenMP thread count,
 # plus two PAMI_* settings.
 # NOTE(review): the PAMI_* variables presumably tune adapter affinity and
 # striping in the MPI messaging layer - confirm against the MPI documentation.
 export OMP_NUM_THREADS=6
 export PAMI_IBV_ADAPTER_AFFINITY=1
 export PAMI_ENABLE_STRIPING=1
 # Extra flags appended (through $OPT) to the Benchmark_dwf_fp32 command lines
 # below; the comms benchmark command line does not use them.
 export OPT="--comms-concurrent --comms-overlap "
 # Disabled setting, kept for reference.
 # NOTE(review): presumably limits the allocator's large-allocation cache -
 # confirm before enabling.
 #export GRID_ALLOC_NCACHE_LARGE=1
 # $APP is expanded unquoted on the jsrun lines on purpose, so the shell splits
 # it into the benchmark executable followed by its arguments.
 # --nrs 4 -a6 presumably launches 4*6 = 24 tasks, which matches the
 # 2*2*2*3 = 24 rank decomposition given by --mpi 2.2.2.3.
 export APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
 jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.4node
 # The ".24" in the output name appears to be the per-rank lattice extent
 # (48/2 = 72/3 = 24).
 # NOTE(review): --shm-force-mpi 1 is passed twice on this command line;
 # presumably redundant - confirm before removing.
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
 jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.24.4node
 # Same benchmark on a larger global lattice; ".32" appears to be the per-rank
 # extent (64/2 = 96/3 = 32).
 APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
 jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.32.4node
@@ -0,0 +1,8 @@
 # Environment setup: exports UCX settings, loads the compiler and CUDA
 # modules, and extends the runtime library search path.
 # NOTE(review): there is no shebang, so this file is presumably meant to be
 # sourced into the current shell rather than executed - confirm with callers.

 # UCX transport settings.
 # NOTE(review): these presumably disable the GDR-copy registration cache and
 # the memory-type cache and select the put_zcopy rendezvous scheme - confirm
 # against the UCX documentation for the installed version.
 export UCX_GDR_COPY_RCACHE=no
 export UCX_MEMTYPE_CACHE=n
 export UCX_RNDV_SCHEME=put_zcopy
 # Toolchain modules.
 module load gcc/7.5.0
 module load cuda/10.2.89
 # Alternative CUDA module version, left disabled.
 #cuda/11.4.0
 # Prepend a user-local prefix to the library search path, keeping any
 # previously configured entries after it.
 export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/:$LD_LIBRARY_PATH 
@@ -5,7 +5,7 @@
    --enable-gen-simd-width=64 \
    --enable-accelerator=cuda \
    --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \
-    --disable-accelerator-cshift \
+    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
@@ -1,2 +1,6 @@
-spack load c-lime
+module load cuda/11.4.1  openmpi/4.1.1-cuda11.4.1  ucx/1.12.0-cuda11.4.1  
-module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
+#module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
 export PREFIX=/home/tc002/tc002/shared/env/prefix/
 export LD_LIBRARY_PATH=$PREFIX/lib/:$LD_LIBRARY_PATH
 unset SBATCH_EXPORT