diff --git a/Grid/algorithms/GeneralCoarsenedMatrix.h b/Grid/algorithms/GeneralCoarsenedMatrix.h
index b87f4cb3..86971fdf 100644
--- a/Grid/algorithms/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/GeneralCoarsenedMatrix.h
@@ -300,10 +300,10 @@ public:
     const int Nsimd = CComplex::Nsimd();
     
     int osites=pin.Grid()->oSites();
-    int gsites=pin.Grid()->gSites();
+    //    int gsites=pin.Grid()->gSites();
 
-    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
-    RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
+    RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
       
     //    for(int point=0;point<npoint;point++){
     //      conformable(A[point],pin);
@@ -358,7 +358,7 @@ public:
 
   void PopulateAdag(void)
   {
-    for(int bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
+    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
       Coordinate bcoor;
       CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
       
@@ -543,10 +543,13 @@ public:
     }
 
     // Only needed if nonhermitian
-    if ( ! hermitian )
+    if ( ! hermitian ) {
+      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
       PopulateAdag();
+    }
 
     // Need to write something to populate Adag from A
+    std::cout << GridLogMessage<<"ExchangeCoarseLinks  "<<std::endl;
     ExchangeCoarseLinks();
     std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
     std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
@@ -556,6 +559,7 @@ public:
   }
   void ExchangeCoarseLinks(void){
     for(int p=0;p<geom.npoint;p++){
+      std::cout << "Exchange "<<p<<std::endl;
       _A[p] = Cell.Exchange(_A[p]);
       _Adag[p]= Cell.Exchange(_Adag[p]);
     }
diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h
index ae1fd1fd..bb3c3b3f 100644
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -70,8 +70,8 @@ public:
   Coordinate _istride;    // Inner stride i.e. within simd lane
   int _osites;                  // _isites*_osites = product(dimensions).
   int _isites;
-  int _fsites;                  // _isites*_osites = product(dimensions).
-  int _gsites;
+  int64_t _fsites;                  // _isites*_osites = product(dimensions).
+  int64_t _gsites;
   Coordinate _slice_block;// subslice information
   Coordinate _slice_stride;
   Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
   inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
   inline int oSites(void) const { return _osites; };
   inline int lSites(void) const { return _isites*_osites; }; 
-  inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
+  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; 
   inline int Nd    (void) const { return _ndimension;};
 
   inline const Coordinate LocalStarts(void)             { return _lstart;    };
@@ -214,7 +214,7 @@ public:
   ////////////////////////////////////////////////////////////////
   // Global addressing
   ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){ // gidx is 64-bit to match gSites(); decoded into gcoor by Lexicographic::CoorFromIndex over _gdimensions
     assert(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
@@ -222,7 +222,7 @@ public:
     assert(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
     gidx=0;
     int mult=1;
     for(int mu=0;mu<_ndimension;mu++) {
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index b0b759b5..c293a3e1 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -360,7 +360,7 @@ public:
 
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
   typedef typename vobj::scalar_object sobj;
-  for(int g=0;g<o.Grid()->_gsites;g++){
+  for(int64_t g=0;g<o.Grid()->_gsites;g++){
 
     Coordinate gcoor;
     o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h
index c9f6aa52..a19edf00 100644
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -432,7 +432,7 @@ public:
 #if 1
     thread_for( lidx, _grid->lSites(), {
 
-	int gidx;
+	int64_t gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index f22b7001..a2e4982e 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -1054,7 +1054,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
 
   Coordinate fcoor(nd);
   Coordinate ccoor(nd);
-  for(int g=0;g<fg->gSites();g++){
+  for(int64_t g=0;g<fg->gSites();g++){
 
     fg->GlobalIndexToGlobalCoor(g,fcoor);
     for(int d=0;d<nd;d++){
diff --git a/Grid/util/Lexicographic.h b/Grid/util/Lexicographic.h
index ff98c6fc..422e42ee 100644
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@@ -8,7 +8,7 @@ namespace Grid{
   public:
 
     template<class coor_t>
-    static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
+    static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
       int nd= dims.size();
       coor.resize(nd);
       for(int d=0;d<nd;d++){
@@ -18,28 +18,45 @@ namespace Grid{
     }
 
     template<class coor_t>
-    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){ // lexicographic index of coor within dims, dims[0] running fastest; result written to index
       int nd=dims.size();
-      int stride=1;
+      int64_t stride=1; // 64-bit: the running product of dims reaches the full volume, which can exceed 2^31
       index=0;
       for(int d=0;d<nd;d++){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
       }
     }
+    template<class coor_t>
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){ // 32-bit overload: same result as the int64_t& overload, range-checked before narrowing
+      int64_t index64;
+      IndexFromCoor(coor,index64,dims); // index64 is int64_t, so this calls the int64_t& overload
+      assert(index64<2*1024*1024*1024LL); // 2*1024^3 == 2^31: the value must fit in a signed 32-bit int
+      index = (int) index64;
+    }
 
     template<class coor_t>
-    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){ // lexicographic index of coor within dims, last dimension running fastest; result written to index
       int nd=dims.size();
-      int stride=1;
+      int64_t stride=1; // 64-bit: the running product of dims reaches the full volume, which can exceed 2^31
       index=0;
       for(int d=nd-1;d>=0;d--){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
       }
     }
     template<class coor_t>
-    static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){ // 32-bit overload: same result as the int64_t& overload, range-checked before narrowing
+      int64_t index64;
+      IndexFromCoorReversed(coor,index64,dims); // index64 is int64_t, so this calls the int64_t& overload
+      if ( index64>=2*1024*1024*1024LL ){ // print the offending inputs before the assert below fires
+	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+      }
+      assert(index64<2*1024*1024*1024LL); // 2*1024^3 == 2^31: the value must fit in a signed 32-bit int
+      index = (int) index64;
+    }
+    template<class coor_t>
+    static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
       int nd= dims.size();
       coor.resize(nd);
       for(int d=nd-1;d>=0;d--){
diff --git a/systems/Frontier/benchmarks/bench2.slurm b/systems/Frontier/benchmarks/bench2.slurm
new file mode 100755
index 00000000..cc82de79
--- /dev/null
+++ b/systems/Frontier/benchmarks/bench2.slurm
@@ -0,0 +1,49 @@
+#!/bin/bash -l
+#SBATCH --job-name=bench
+##SBATCH --partition=small-g
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=7
+#SBATCH --gpus-per-node=8
+#SBATCH --time=00:10:00
+#SBATCH --account=phy157_dwf
+#SBATCH --gpu-bind=none
+#SBATCH --exclusive
+#SBATCH --mem=0
+
+# Runs Benchmark_dwf_fp32 on 2 nodes x 8 ranks (--mpi 2.2.2.2) for every
+# combination of comms mode (overlap/sequential) and --shm-mpi (0/1); each
+# run writes its own log.* file in the current directory.
+
+# Generate the per-rank launcher: it picks the GPU and NUMA domain from the
+# node-local rank, then execs the real command under numactl.
+cat << EOF > select_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3 7 6 5 4)
+export NUMA_MAP=(3 3 1 1 2 2 0 0)
+export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
+export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
+export HIP_VISIBLE_DEVICES=\$GPU
+unset ROCR_VISIBLE_DEVICES
+echo RANK \$SLURM_LOCALID using GPU \$GPU
+exec numactl -m \$NUMA -N \$NUMA "\$@"
+EOF
+
+chmod +x ./select_gpu
+
+root=$HOME/Frontier/Grid/systems/Frontier/
+source "${root}/sourceme.sh"
+
+export OMP_NUM_THREADS=7
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+
+for vol in 32.32.32.64
+do
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
+
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
+done
+
diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command
new file mode 100644
index 00000000..b932ba7f
--- /dev/null
+++ b/systems/Frontier/config-command
@@ -0,0 +1,25 @@
+# Configure line for building Grid on Frontier with the HIP accelerator backend.
+# Invokes ../../configure, so run it from a directory two levels below the one holding configure.
+# References OLCF_GMP_ROOT, FFTW_DIR, ROCM_PATH, MPICH_DIR and CRAY_MPICH_ROOTDIR from the environment.
+CLIME=$(spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-)
+../../configure --enable-comms=mpi-auto \
+--with-lime="$CLIME" \
+--enable-unified=no \
+--enable-shm=nvlink \
+--enable-tracing=timer \
+--enable-accelerator=hip \
+--enable-gen-simd-width=64 \
+--disable-gparity \
+--disable-fermion-reps \
+--enable-simd=GPU \
+--enable-accelerator-cshift \
+--with-gmp=$OLCF_GMP_ROOT \
+--with-fftw=$FFTW_DIR/.. \
+--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
+CXX=hipcc MPICXX=mpicxx \
+CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \
+ LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
+
+
+
+
diff --git a/systems/Frontier/mpiwrapper.sh b/systems/Frontier/mpiwrapper.sh
new file mode 100755
index 00000000..f6a56698
--- /dev/null
+++ b/systems/Frontier/mpiwrapper.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Per-rank launcher: exposes one GPU to this rank through ROCR_VISIBLE_DEVICES,
+# chosen from the node-local rank, then runs the command given as arguments.
+
+lrank=$SLURM_LOCALID
+lgpu=(0 1 2 3 7 6 5 4)
+
+export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
+
+echo "$(hostname) - $lrank device=$ROCR_VISIBLE_DEVICES "
+
+# "$@" rather than $* so arguments containing spaces reach the command intact
+"$@"
+
+
+
diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh
new file mode 100644
index 00000000..987241b4
--- /dev/null
+++ b/systems/Frontier/sourceme.sh
@@ -0,0 +1,13 @@
+. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh # sets up spack in this shell; presumably required for the `spack load` below
+spack load c-lime
+#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
+module load emacs 
+module load PrgEnv-gnu
+module load rocm
+module load cray-mpich/8.1.23
+module load gmp
+module load cray-fftw
+module load craype-accel-amd-gfx90a
+export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH # prepended, so this mpfr 3.1.4 directory is searched first
+#Hack for lib: uncomment the next line to also search the current directory for shared libraries
+#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
diff --git a/systems/Frontier/wrap.sh b/systems/Frontier/wrap.sh
new file mode 100755
index 00000000..eb58353c
--- /dev/null
+++ b/systems/Frontier/wrap.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Wrapper that hands the GPU selection in ROCR_VISIBLE_DEVICES over to
+# HIP_VISIBLE_DEVICES, then runs the command given as arguments.
+
+export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+unset ROCR_VISIBLE_DEVICES
+
+#rank=$SLURM_PROCID
+#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace "$@"
+
+# Quoted so arguments containing spaces are passed through unsplit
+"$@"
diff --git a/tests/debug/Test_general_coarse_hdcg.cc b/tests/debug/Test_general_coarse_hdcg.cc
index becb7e51..2fe0b90a 100644
--- a/tests/debug/Test_general_coarse_hdcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg.cc
@@ -305,10 +305,6 @@ int main (int argc, char ** argv)
   //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
   std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)  
 
-  // Standard CG
-  //      result=Zero();
-  //      CGfine(HermOpEO, src, result);
-
   for(int l=0;l<los.size();l++){
 
     RealD lo = los[l];
@@ -348,6 +344,10 @@ int main (int argc, char ** argv)
       
     }
   }
+
+  // Standard CG
+  result=Zero();
+  CGfine(HermOpEO, src, result);
   
   Grid_finalize();
   return 0;