diff --git a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
index fcd79d5f..fc464ea1 100644
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -62,6 +62,7 @@ public:
   
   std::vector<deviceVector<calcMatrix> > _A;
   std::vector<CoarseVector> MultTemporaries;
+  deviceVector<GeneralStencilEntryReordered> StencilMasked;
 
   ///////////////////////
   // Interface
@@ -78,9 +79,40 @@ public:
     Stencil(Cell.grids.back(),geom.shifts)
   {
     _A.resize(geom.npoint);
+    int32_t padded_sites = _Op._A[0].Grid()->lSites();
     for(int p=0;p<geom.npoint;p++){
-      _A[p].resize(_CoarseGrid->lSites());
+      _A[p].resize(padded_sites);
     }
+    std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
+
+    // Compact the stencil: keep only sites none of whose npoint entries has _permute set; _output records the compacted site index.
+    std::vector<GeneralStencilEntryReordered> StencilTmp;
+    int32_t j=0;                                         // number of sites kept so far
+    int32_t sites = Stencil._entries.size()/geom.npoint; // entries are indexed as site*npoint+point
+    for(int32_t s=0;s<sites;s++){
+      int ghost_zone=0;
+      for(int32_t point = 0 ; point < geom.npoint; point++){
+	int i=s*geom.npoint+point;
+	if( Stencil._entries[i]._permute ) {
+	  ghost_zone=1; // at least one entry of this site is flagged: the whole site is dropped
+	}
+      }
+      GeneralStencilEntryReordered tmp;
+      if( ghost_zone==0) {
+	for(int32_t point = 0 ; point < geom.npoint; point++){
+	  int i=s*geom.npoint+point;
+	  tmp._offset = Stencil._entries[i]._offset;
+	  tmp._permute= Stencil._entries[i]._permute;
+	  tmp._output = j;
+	  StencilTmp.push_back(tmp);
+	}
+	j++;
+      }
+    }
+    std::cout << GridLogMessage<<"coarse osites x npoint "<<_CoarseGridMulti->oSites()*geom.npoint<< " stencil interior size "<< StencilTmp.size()<<std::endl;
+    assert(_CoarseGridMulti->lSites()*geom.npoint==StencilTmp.size());
+    StencilMasked.resize(StencilTmp.size()); // one device slot per entry copied below; oSites() alone is smaller than the lSites()*npoint asserted above
+    acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
     CopyMatrix();
   }
   void CopyMatrix (void)
@@ -100,12 +132,18 @@ public:
   }
   void M (const CoarseVector &in, CoarseVector &out)
   {
+    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
+    RealD tmult2=0;
+
+    ttot=-usecond();
     conformable(CoarseGrid(),in.Grid());
     conformable(in.Grid(),out.Grid());
     out.Checkerboard() = in.Checkerboard();
     CoarseVector tin=in;
 
+    texch-=usecond();
     CoarseVector pin = Cell.ExchangePeriodic(tin);
+    texch+=usecond();
     CoarseVector pout(pin.Grid());
 
     int npoint = geom.npoint;
@@ -116,22 +154,33 @@ public:
     
     int64_t osites=pin.Grid()->oSites();
     int64_t nrhs  =pin.Grid()->GlobalDimensions()[0]/Nsimd;
+    assert(nrhs>=1);
 
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+                + 2.0*osites*sizeof(siteVector)*npoint;
+    
     {
+      tviews-=usecond();
       autoView( in_v , pin, AcceleratorRead);
       autoView( out_v , pout, AcceleratorWriteDiscard);
       autoView( Stencil_v  , Stencil, AcceleratorRead);
+      tviews+=usecond();
 
       // Static and prereserve to keep UVM region live and not resized across multiple calls
+      ttemps-=usecond();
       MultTemporaries.resize(npoint,pin.Grid());       
+      ttemps+=usecond();
 
       std::vector<Aview> AcceleratorViewContainer_h;
       std::vector<Vview> AcceleratorVecViewContainer_h; 
 
+      tviews-=usecond();
       for(int p=0;p<npoint;p++) {
 	AcceleratorViewContainer_h.push_back( &_A[p][0]);
 	AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
       }
+      tviews+=usecond();
 
       static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
       static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); 
@@ -139,15 +188,23 @@ public:
       auto Aview_p = &AcceleratorViewContainer[0];
       auto Vview_p = &AcceleratorVecViewContainer[0];
 
+      tcopy-=usecond();
       acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
       acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
+      tcopy+=usecond();
 
+      int32_t bound = _A[0].size();
+      std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
+      std::cout << " padded local dims   "<<pin.Grid()->LocalDimensions()<<std::endl;
+      std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
+      tmult-=usecond();
       accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
 	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
 	  int32_t ss   = rspb/(nbasis*npoint);
 	  int32_t bp   = rspb%(nbasis*npoint);
 	  int32_t point= bp/nbasis;
 	  int32_t b    = bp%nbasis;
+	  assert(ss<bound);
 	  auto SE  = Stencil_v.GetEntry(point,ss);
 	  if ( SE->_permute == 0 ) { 
 	    int32_t snbr= SE->_offset;
@@ -159,6 +216,7 @@ public:
 	    coalescedWrite(Vview_p[point][ss](b),res);
 	  }
       });
+      tmult2-=usecond();
       accelerator_for(sb, osites*nbasis, Nsimd, {
 	  int ss = sb/nbasis;
 	  int b  = sb%nbasis;
@@ -168,13 +226,32 @@ public:
 	  }
 	  coalescedWrite(out_v[ss](b),res);
       });
+      tmult2+=usecond();
+      tmult+=usecond();
       for(int p=0;p<npoint;p++) {
 	AcceleratorVecViewContainer_h[p].ViewClose();
       }
     }
 
+    text-=usecond();
     out = Cell.Extract(pout);
+    text+=usecond();
+    ttot+=usecond();
 
+    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" of which mult2  "<<tmult2<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    // Timers are in microseconds (usecond()), so flops/us and bytes/us below read directly as Mflop/s and MB/s.
+    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+    
   };
   virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 38501d3d..0521757d 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -745,8 +745,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
 
-  static const int words=sizeof(vobj)/sizeof(vector_type);
-
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
   assert(!Fg->_isCheckerBoarded);
@@ -763,13 +761,14 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   // the above should guarantee that the operations are local
   
 #if 1
-
   size_t nsite = 1;
   for(int i=0;i<nd;i++) nsite *= RegionSize[i];
   
   size_t tbytes = 4*nsite*sizeof(int);
   int *table = (int*)malloc(tbytes);
- 
+
+  RealD t_cpu=-usecond();
+#if 0
   thread_for(idx, nsite, {
       Coordinate from_coor, to_coor;
       size_t rem = idx;
@@ -792,15 +791,44 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   
   int* table_d = (int*)acceleratorAllocDevice(tbytes);
   acceleratorCopyToDevice(table,table_d,tbytes);
+#else
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;
+
+  accelerator_for(idx, nsite, 1, {
+      Coordinate from_coor, to_coor;
+      size_t rem = idx;
+      for(int i=0;i<nd;i++){
+	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
+	from_coor[i] = base_i + FromLowerLeft[i];
+	to_coor[i] = base_i + ToLowerLeft[i];
+      }
+      int foidx = 0; for(int d=0;d<nd;d++) foidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int fiidx = 0; for(int d=0;d<nd;d++) fiidx+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int toidx = 0; for(int d=0;d<nd;d++) toidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int tiidx = 0; for(int d=0;d<nd;d++) tiidx+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
+      int* tt = table_d + 4*idx;
+      tt[0] = foidx;
+      tt[1] = fiidx;
+      tt[2] = toidx;
+      tt[3] = tiidx;
+    });
+#endif
+  t_cpu+=usecond();
 
   typedef typename vobj::vector_type vector_type;
   typedef typename vobj::scalar_type scalar_type;
 
   autoView(from_v,From,AcceleratorRead);
   autoView(to_v,To,AcceleratorWrite);
-  
-  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
+  RealD t_acc=-usecond();
+  const int words=sizeof(vobj)/sizeof(vector_type);
+  accelerator_for(idx,nsite,words,{
       int* tt = table_d + 4*idx;
       int from_oidx = *tt++;
       int from_lane = *tt++;
@@ -811,12 +839,20 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
       vector_type* to = (vector_type *)&to_v[to_oidx];
       
       scalar_type stmp;
+#ifdef GRID_SIMT
+      int w = acceleratorSIMTlane(words);
+      stmp = getlane(from[w], from_lane);
+      putlane(to[w], stmp, to_lane);
+#else
       for(int w=0;w<words;w++){
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
       }
+#endif
     });
-  
+  t_acc+=usecond();
+  std::cout << GridLogPerformance << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl; // t_cpu: time to build the index table
+  std::cout << GridLogPerformance << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl; // t_acc: time of the lane-copy kernel
   acceleratorFreeDevice(table_d);    
   free(table);
   
diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h
index 7f092f62..e573049d 100644
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -403,18 +403,8 @@ public:
       double t = usecond();
       padded = in;
       tins += usecond() - t;
-      
+      // TODO: could we "return in;" here instead of copying into padded?
     } else {
-
-      //////////////////////////////////////////////
-      // Replace sequence with
-      // ---------------------
-      // (i) Gather high face(s); start comms
-      // (ii) Gather low  face(s); start comms
-      // (iii) Copy middle bit with localCopyRegion
-      // (iv) Complete high face(s), insert slice(s)
-      // (iv) Complete low  face(s), insert slice(s)
-      //////////////////////////////////////////////
       Face_exchange(in,padded,dim,depth);
     }
     return padded;
@@ -482,6 +472,7 @@ public:
     // Gather all surface terms up to depth "d"
     ////////////////////////////////////////////////////////////////////////////
     RealD t;
+    RealD t_tot=-usecond();
     int plane=0;
     for ( int d=0;d < depth ; d ++ ) {
       int tag = d*1024 + dimension*2+0;
@@ -549,6 +540,7 @@ public:
     }
     t_scatter+= usecond() - t;
     //    DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
+    t_tot+=usecond();
 
     //DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
     std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
@@ -557,6 +549,7 @@ public:
     //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
     std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
     std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
     //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl;
   }
   
diff --git a/Grid/log/Log.cc b/Grid/log/Log.cc
index 166aea0a..6874620f 100644
--- a/Grid/log/Log.cc
+++ b/Grid/log/Log.cc
@@ -90,7 +90,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 
   for (int i = 0; i < logstreams.size(); i++) {
     if (logstreams[i] == std::string("Tracing"))     GridLogTracing.Active(1);
-    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
+    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
     if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
     if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
     if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h
index 04eaadb7..1a36ad43 100644
--- a/Grid/stencil/GeneralLocalStencil.h
+++ b/Grid/stencil/GeneralLocalStencil.h
@@ -33,6 +33,10 @@ struct GeneralStencilEntry {
   uint64_t _offset;            // 4 bytes 
   uint8_t _permute;            // 1 bytes // Horrible alignment properties
 };
+struct GeneralStencilEntryReordered : GeneralStencilEntry { // struct inheritance is public by default
+  uint64_t _output;            // extra index carried with the entry; the multi-RHS coarse operator constructor stores the compacted site index here
+};
+
 // Could pack to 8 + 4 + 4 = 128 bit and use 
 
 class GeneralLocalStencilView {
diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command
index 41335bd9..53b8e822 100644
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@@ -6,6 +6,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
+--enable-tracing=roctx \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
diff --git a/tests/debug/Test_general_coarse.cc b/tests/debug/Test_general_coarse.cc
index f93f5790..f980d7c7 100644
--- a/tests/debug/Test_general_coarse.cc
+++ b/tests/debug/Test_general_coarse.cc
@@ -78,7 +78,7 @@ int main (int argc, char ** argv)
   // Construct a coarsened grid
   Coordinate clatt = GridDefaultLatt();
   for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/2;
+    clatt[d] = clatt[d]/4;
   }
 
   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
@@ -107,7 +107,7 @@ int main (int argc, char ** argv)
 
   DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
-  const int nbasis = 16;
+  const int nbasis = 32;
   const int cb = 0 ;
   LatticeFermion prom(FGrid);
 
@@ -265,8 +265,8 @@ int main (int argc, char ** argv)
 	LittleDiracOp.M(phi,Aphi);
       }
       t1+=usecond();
-      std::cout << r << " mrhs " << norm2(chi)<<std::endl;
-      std::cout << r << " srhs " << norm2(Aphi)<<std::endl;
+      std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
+      std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
       chi=chi-Aphi;
       std::cout << r << " diff " << norm2(chi)<<std::endl;
     }