diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc
index c1a4d93a..30be510b 100644
--- a/Grid/allocator/MemoryManager.cc
+++ b/Grid/allocator/MemoryManager.cc
@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared   (4)
 #define SharedSmall (5)
+#undef GRID_MM_VERBOSE // tracing off by default; #define instead to print PrintBytes() after every allocate/free
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-  std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
-  std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl; // banner; all figures below are in units of 2^20 bytes
+  std::cout << " MemoryManager : PrintBytes "<<std::endl;
+  std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared      Mbytes "<<std::endl; // >>20 converts bytes to Mbytes, rounding down
+  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
+  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
+  uint64_t cacheBytes;
+  cacheBytes = CacheBytes[Cpu]; // bytes held in the free pool at index Cpu only
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
+  cacheBytes = CacheBytes[Acc]; // bytes held in the free pool at index Acc only
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
+  cacheBytes = CacheBytes[Shared]; // index Shared only; small allocations are cached at SharedSmall and are not included here
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
+  
+#ifdef GRID_CUDA
+  cuda_mem(); // additionally print used/free/total device memory as reported by cudaMemGetInfo
+#endif
+  
 }
 
 //////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
-
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 }; // number of cache entries per free pool; pools are indexed by (type + small)
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType]; // bytes currently parked in each free pool; updated by Insert and Lookup
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
+  total_device+=bytes; // counted for every request, whether served from the free pool or freshly allocated
   void *ptr = (void *) Lookup(bytes,Acc);
   if ( ptr == (void *) NULL ) {
     ptr = (void *) acceleratorAllocDevice(bytes);
-    total_device+=bytes;
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"AcceleratorAllocate "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
   return ptr;
 }
 void  MemoryManager::AcceleratorFree    (void *ptr,size_t bytes)
 {
+  total_device-=bytes; // removed from the in-use total even when the buffer is only parked in the free pool
   void *__freeme = Insert(ptr,bytes,Acc);
   if ( __freeme ) {
     acceleratorFreeDevice(__freeme);
-    total_device-=bytes;
-    //    PrintBytes();
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"AcceleratorFree "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
 }
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
+  total_shared+=bytes; // counted for every request, including those satisfied by Lookup from the free pool
   void *ptr = (void *) Lookup(bytes,Shared);
   if ( ptr == (void *) NULL ) {
     ptr = (void *) acceleratorAllocShared(bytes);
-    total_shared+=bytes;
-    //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
-    //    PrintBytes();
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"SharedAllocate "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
   return ptr;
 }
 void  MemoryManager::SharedFree    (void *ptr,size_t bytes)
 {
+  total_shared-=bytes; // removed from the in-use total whether or not Insert hands back a buffer to release
   void *__freeme = Insert(ptr,bytes,Shared);
   if ( __freeme ) {
     acceleratorFreeShared(__freeme);
-    total_shared-=bytes;
-    //    PrintBytes();
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"SharedFree "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
 }
 #ifdef GRID_UVM
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
+  total_host+=bytes; // GRID_UVM build: host total is bumped for every request, cached or fresh
   void *ptr = (void *) Lookup(bytes,Cpu);
   if ( ptr == (void *) NULL ) {
     ptr = (void *) acceleratorAllocShared(bytes);
-    total_host+=bytes;
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuAllocate "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
   return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
+  total_host-=bytes; // GRID_UVM build: host total drops on every free, before the buffer is offered to the free pool
   NotifyDeletion(_ptr);
   void *__freeme = Insert(_ptr,bytes,Cpu);
   if ( __freeme ) { 
     acceleratorFreeShared(__freeme);
-    total_host-=bytes;
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuFree "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
 }
 #else
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
+  total_host+=bytes; // non-UVM build: host total is bumped for every request, cached or fresh
   void *ptr = (void *) Lookup(bytes,Cpu);
   if ( ptr == (void *) NULL ) {
     ptr = (void *) acceleratorAllocCpu(bytes);
-    total_host+=bytes;
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuAllocate "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
   return ptr;
 }
 void  MemoryManager::CpuFree    (void *_ptr,size_t bytes)
 {
+  total_host-=bytes; // non-UVM build: host total drops on every free, before the buffer is offered to the free pool
   NotifyDeletion(_ptr);
   void *__freeme = Insert(_ptr,bytes,Cpu);
   if ( __freeme ) { 
     acceleratorFreeCpu(__freeme);
-    total_host-=bytes;
   }
+#ifdef GRID_MM_VERBOSE
+  std::cout <<"CpuFree "<<std::endl; // trace output, compiled only when GRID_MM_VERBOSE is defined
+  PrintBytes();
+#endif
 }
 #endif
 
@@ -181,13 +225,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
   int cache = type + small;
-  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);  
+  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);  
 #else
   return ptr;
 #endif
 }
 
-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) 
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) 
 {
   assert(ncache>0);
 #ifdef GRID_OMP
@@ -211,6 +255,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 
   if ( entries[v].valid ) {
     ret = entries[v].address;
+    cacheBytes -= entries[v].bytes;
     entries[v].valid = 0;
     entries[v].address = NULL;
     entries[v].bytes = 0;
@@ -219,6 +264,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
   entries[v].address=ptr;
   entries[v].bytes  =bytes;
   entries[v].valid  =1;
+  cacheBytes += bytes;
 
   return ret;
 }
@@ -228,13 +274,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
   bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
   int cache = type+small;
-  return Lookup(bytes,Entries[cache],Ncache[cache]);
+  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
   return NULL;
 #endif
 }
 
-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) 
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) 
 {
   assert(ncache>0);
 #ifdef GRID_OMP
@@ -243,6 +289,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
   for(int e=0;e<ncache;e++){
     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
       entries[e].valid = 0;
+      cacheBytes -= entries[e].bytes;
       return entries[e].address;
     }
   }
diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h
index 25c5b5f5..eafcd83f 100644
--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@@ -82,14 +82,15 @@ private:
   static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
   static int Victim[NallocType];
   static int Ncache[NallocType];
+  static uint64_t CacheBytes[NallocType];
 
   /////////////////////////////////////////////////
   // Free pool
   /////////////////////////////////////////////////
   static void *Insert(void *ptr,size_t bytes,int type) ;
   static void *Lookup(size_t bytes,int type) ;
-  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
-  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+  static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
+  static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
 
   static void PrintBytes(void);
  public:
diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc
index 275ed5e0..72111dbd 100644
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@@ -3,7 +3,7 @@
 
 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)
 
 
@@ -429,6 +429,7 @@ void  MemoryManager::NotifyDeletion(void *_ptr)
 }
 void  MemoryManager::Print(void)
 {
+  PrintBytes();
   std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
   std::cout << GridLogDebug << "Memory Manager                             " << std::endl;
   std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 52fadc0b..14e07248 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -74,11 +74,13 @@ void acceleratorInit(void)
       //      GPU_PROP(singleToDoublePrecisionPerfRatio);
     }
   }
+
   MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
 #undef GPU_PROP_FMT    
 #undef GPU_PROP
 
 #ifdef GRID_DEFAULT_GPU
+  int device = 0;
   // IBM Jsrun makes cuda Device numbering screwy and not match rank
   if ( world_rank == 0 ) {
     printf("AcceleratorCudaInit: using default device \n");
@@ -87,10 +89,20 @@ void acceleratorInit(void)
     printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
   }
 #else
+  int device = rank;
   printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
   printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
-  cudaSetDevice(rank);
 #endif
+
+  cudaSetDevice(device);
+
+  const int len=64;
+  char busid[len] = "unknown"; // pre-filled so the report below never prints an indeterminate buffer if the query fails
+  if( rank == world_rank ) { 
+    if ( cudaDeviceGetPCIBusId(busid, len, device) != cudaSuccess ) printf("AcceleratorCudaInit: cudaDeviceGetPCIBusId failed for device %d\n", device); // e.g. device index not visible to this rank
+    printf("local rank %d device %d bus id: %s\n", rank, device, busid);
+  }
+
   if ( world_rank == 0 )  printf("AcceleratorCudaInit: ================================================\n");
 }
 #endif
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 974cc6ce..83b27429 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -115,6 +115,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #endif
 } // CUDA specific
 
+inline void cuda_mem(void) // print used/free/total device memory as seen by the CUDA runtime for the current device
+{
+  size_t free_t=0,total_t=0; // zero-initialised so nothing indeterminate can ever reach the output
+  if ( cudaMemGetInfo(&free_t,&total_t) != cudaSuccess ) { std::cout << " MemoryManager : cudaMemGetInfo failed "<<std::endl; return; } // report the failed query instead of printing bogus sizes
+  const size_t used_t=total_t-free_t; // everything not free counts as used; cudaMemGetInfo gives no finer breakdown
+  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
+}
+
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
   {									\
     int nt=acceleratorThreads();					\
@@ -125,7 +133,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     };									\
     dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
+    /* Deliberately no tracing around the launch: printing to stdout and  */	\
+    /* querying device memory on every kernel call floods the log and     */	\
     LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
+    /* adds host-side work to each launch of this non-blocking macro.     */	\
+    /* Call cuda_mem() explicitly at the points of interest instead.      */	\
   }
 
 #define accelerator_for6dNB(iter1, num1,				\
diff --git a/systems/Summit/config-command b/systems/Summit/config-command
new file mode 100644
index 00000000..b565addc
--- /dev/null
+++ b/systems/Summit/config-command
@@ -0,0 +1,14 @@
+../../configure --enable-comms=mpi \
+	      --enable-simd=GPU \
+	      --enable-gen-simd-width=32 \
+	      --enable-unified=no \
+	       --enable-shm=nvlink \
+	       --disable-gparity \
+	       --enable-setdevice \
+	       --disable-fermion-reps \
+	       --enable-accelerator=cuda \
+	       --prefix /ccs/home/paboyle/prefix \
+	       CXX=nvcc \
+	       LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
+	       CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
+# Grid configure line for Summit: nvcc with mpicxx as host compiler, CUDA accelerator, MPI comms, device code generated for sm_70.
diff --git a/systems/Summit/dwf16.lsf b/systems/Summit/dwf16.lsf
new file mode 100644
index 00000000..16f4b82d
--- /dev/null
+++ b/systems/Summit/dwf16.lsf
@@ -0,0 +1,25 @@
+#!/bin/bash
+#BSUB -P LGT104
+#BSUB -W 2:00
+#BSUB -nnodes 4
+#BSUB -J DWF
+
+export OMP_NUM_THREADS=6
+export PAMI_IBV_ADAPTER_AFFINITY=1
+export PAMI_ENABLE_STRIPING=1
+export OPT="--comms-concurrent --comms-overlap "
+
+APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 --shm-force-mpi 1 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+
+
+
+
+
diff --git a/systems/Summit/dwf4.lsf b/systems/Summit/dwf4.lsf
new file mode 100644
index 00000000..fcd80bcb
--- /dev/null
+++ b/systems/Summit/dwf4.lsf
@@ -0,0 +1,25 @@
+#!/bin/bash
+#BSUB -P LGT104
+#BSUB -W 2:00
+#BSUB -nnodes 4
+#BSUB -J DWF
+# Runs the host/device comms benchmark, then Benchmark_dwf_fp32 on two global volumes, all on the 2.2.2.3 MPI layout.
+export OMP_NUM_THREADS=6
+export PAMI_IBV_ADAPTER_AFFINITY=1
+export PAMI_ENABLE_STRIPING=1
+export OPT="--comms-concurrent --comms-overlap "
+#export GRID_ALLOC_NCACHE_LARGE=1
+export APP="./benchmarks/Benchmark_comms_host_device  --mpi 2.2.2.3 "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 1024 --shm-force-mpi 1 --device-mem 8000 $OPT "
+jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP
+
+
+
+
+
+
diff --git a/systems/Summit/sourceme-cuda10.sh b/systems/Summit/sourceme-cuda10.sh
new file mode 100644
index 00000000..58217613
--- /dev/null
+++ b/systems/Summit/sourceme-cuda10.sh
@@ -0,0 +1,8 @@
+export UCX_GDR_COPY_RCACHE=no
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_SCHEME=put_zcopy
+module load gcc/7.5.0
+module load cuda/10.2.89
+#cuda/11.4.0
+export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} # append the old value only when set, so no empty entry (= current directory) is added
+