diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index bb06d43f..a15f9789 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -1,4 +1,3 @@
-
 /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
@@ -108,6 +107,8 @@ public:
   ////////////////////////////////////////////////////////////
   // Reduction
   ////////////////////////////////////////////////////////////
+  void GlobalMax(RealD &);
+  void GlobalMax(RealF &);
   void GlobalSum(RealF &);
   void GlobalSumVector(RealF *,int N);
   void GlobalSum(RealD &);
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index c6543851..5713fe35 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalMax(float &f)
+{
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalMax(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 6cb431a2..beb2cc97 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 
 CartesianCommunicator::~CartesianCommunicator(){}
 
+void CartesianCommunicator::GlobalMax(float &){}
+void CartesianCommunicator::GlobalMax(double &){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index c2955485..7338fd41 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -96,8 +96,34 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
   ssobj ret = ssum;
   return ret;
 }
+/*
+Threaded max, don't use for now
+template<class Double>
+inline Double max(const Double *arg, Integer osites)
+{
+  //  const int Nsimd = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();
 
-
+  std::vector<Double> maxarray(nthread);
+  
+  thread_for(thr,nthread, {
+    int nwork, mywork, myoff;
+    nwork = osites;
+    GridThread::GetWork(nwork,thr,mywork,myoff);
+    Double max=arg[0];
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      if( arg[ss] > max ) max = arg[ss];
+    }
+    maxarray[thr]=max;
+  });
+  
+  Double tmax=maxarray[0];
+  for(int i=0;i<nthread;i++){
+    if (maxarray[i]>tmax) tmax = maxarray[i];
+  } 
+  return tmax;
+}
+*/
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
@@ -140,6 +166,31 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
   ComplexD nrm = innerProduct(arg,arg);
   return real(nrm); 
 }
+template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
+{
+  typedef typename vobj::tensor_reduced vscalar;
+  typedef typename vobj::scalar_object  scalar;
+  typedef typename getPrecision<vobj>::real_scalar_type rscalar;
+
+  Lattice<vscalar> inner = localNorm2(arg);
+
+  auto grid = arg.Grid();
+
+  RealD max;
+  for(int l=0;l<grid->lSites();l++){
+    Coordinate coor;
+    scalar val;
+    RealD r;
+    grid->LocalIndexToLocalCoor(l,coor);
+    peekLocalSite(val,inner,coor);
+    r=real(TensorRemove(val));
+    if( (l==0) || (r>max)){
+      max=r;
+    }
+  }
+  grid->GlobalMax(max);
+  return max;
+}
 
 // Double inner product
 template<class vobj>
diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc
index d7ed04ba..d3e6bfbd 100644
--- a/tests/core/Test_main.cc
+++ b/tests/core/Test_main.cc
@@ -231,6 +231,19 @@ int main(int argc, char **argv) {
       scalar = localInnerProduct(cVec, cVec);
       scalar = localNorm2(cVec);
 
+      std::cout << "Testing maxLocalNorm2" <<std::endl;
+      for(Integer gsite=0;gsite<Fine.gSites();gsite++){
+
+	TComplex big(10.0);
+	Coordinate coor;
+
+	random(FineRNG, scalar);
+	Fine.GlobalIndexToGlobalCoor(gsite,coor);
+        pokeSite(big,scalar,coor);
+	
+	RealD Linfty = maxLocalNorm2(scalar);
+	assert(Linfty == 100.0);
+      }
       //     -=,+=,*=,()
       //     add,+,sub,-,mult,mac,*
       //     adj,conjugate
@@ -549,7 +562,8 @@ int main(int argc, char **argv) {
 
                   std::vector<int> shiftcoor = coor;
                   shiftcoor[dir] = (shiftcoor[dir] + shift + latt_size[dir]) %
-                                   (latt_size[dir] / mpi_layout[dir]);
+                                   (latt_size[dir]);
+		  //                                   (latt_size[dir] / mpi_layout[dir]);
 
                   std::vector<int> rl(4);
                   for (int dd = 0; dd < 4; dd++) {