From c9c073eee41622c87d89d2e89de86aa33e38bdbd Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 23 Mar 2018 11:27:56 +0000
Subject: [PATCH 01/28] Changes in messages in test dwf mixedprec

---
 tests/Test_dwf_mixedcg_prec.cc | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc
index a53d8921..92567b6f 100644
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -49,6 +49,8 @@ int main (int argc, char ** argv)
 
   const int Ls=8;
 
+  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
+
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -90,24 +92,23 @@ int main (int argc, char ** argv)
   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
-  std::cout << "Starting mixed CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
   MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
   mCG(src_o,result_o);
 
-  std::cout << "Starting regular CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
   CG(HermOpEO,src_o,result_o_2);
 
   LatticeFermionD diff_o(FrbGrid);
   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 
-  std::cout << "Diff between mixed and regular CG: " << diff << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
 
   #ifdef HAVE_LIME
   if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
   
   std::string file1("./Propagator1");
-  std::string file2("./Propagator2");
   emptyUserRecord record;
   uint32_t nersc_csum;
   uint32_t scidac_csuma;
@@ -121,12 +122,12 @@ int main (int argc, char ** argv)
   BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
 						   nersc_csum,scidac_csuma,scidac_csumb);
 
-  std::cout << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
+  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
 
   BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
 						   nersc_csum,scidac_csuma,scidac_csumb);
 
-  std::cout << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
+  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
   }
   #endif
 

From a1be53332956d4eb074632ef10a1449a0aac582e Mon Sep 17 00:00:00 2001
From: Guido Cossu <gcossu@cori05.nersc.gov>
Date: Tue, 24 Apr 2018 01:19:53 -0700
Subject: [PATCH 02/28] Corrected Flop count in Benchmark su3 and expanded the
 Wilson flow output

---
 benchmarks/Benchmark_su3.cc   | 4 ++--
 lib/qcd/smearing/WilsonFlow.h | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 628ad5bd..5f088fdc 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -166,7 +166,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000.0;
       
       double bytes=3*vol*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(8+8+8)*vol;
+      double flops=Nc*Nc*(6+8+8)*vol;
       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
 
     }
@@ -199,7 +199,7 @@ int main (int argc, char ** argv)
 	double time = (stop-start)/Nloop*1000.0;
 	
 	double bytes=3*vol*Nc*Nc*sizeof(Complex);
-	double flops=Nc*Nc*(8+8+8)*vol;
+	double flops=Nc*Nc*(6+8+8)*vol;
 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
     }
diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h
index 4f5c0d43..1c0855ab 100644
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/lib/qcd/smearing/WilsonFlow.h
@@ -173,8 +173,8 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
         std::cout << "Time to evolve " << diff.count() << " s\n";
         #endif
         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-            << step << "  "
-            << energyDensityPlaquette(step,out) << std::endl;
+		  << step << "  " << tau(step) << "  " 
+		  << energyDensityPlaquette(step,out) << std::endl;
          if( step % measure_interval == 0){
          std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
             << step << "  " 
@@ -193,8 +193,8 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
         //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
         evolve_step_adaptive(out, maxTau);
         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-            << step << "  "
-            << energyDensityPlaquette(out) << std::endl;
+		  << step << "  " << taus << "  "
+		  << energyDensityPlaquette(out) << std::endl;
          if( step % measure_interval == 0){
          std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
             << step << "  " 

From c5b9147b5334be7f7996e8814df6762f0b9ae1e9 Mon Sep 17 00:00:00 2001
From: Guido Cossu <gcossu@cori02.nersc.gov>
Date: Tue, 24 Apr 2018 08:03:57 -0700
Subject: [PATCH 03/28] Correction of a minor bug in the su3 benchmark

---
 benchmarks/Benchmark_su3.cc | 52 ++++++++++++++++++-------------------
 lib/cshift/Cshift_mpi.h     | 13 +++++++---
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 5f088fdc..b31af942 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=4;lat<=LMAX;lat+=LINC){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -147,30 +147,30 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
   for(int lat=2;lat<=LMAX;lat+=LINC){
-
-      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-
-      LatticeColourMatrix z(&Grid); random(pRNG,z);
-      LatticeColourMatrix x(&Grid); random(pRNG,x);
-      LatticeColourMatrix y(&Grid); random(pRNG,y);
-
-      double start=usecond();
-      for(int64_t i=0;i<Nloop;i++){
-	mac(z,x,y);
-      }
-      double stop=usecond();
-      double time = (stop-start)/Nloop*1000.0;
-      
-      double bytes=3*vol*Nc*Nc*sizeof(Complex);
-      double flops=Nc*Nc*(6+8+8)*vol;
-      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
-
+    
+    std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    
+    GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+    GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    
+    LatticeColourMatrix z(&Grid); random(pRNG,z);
+    LatticeColourMatrix x(&Grid); random(pRNG,x);
+    LatticeColourMatrix y(&Grid); random(pRNG,y);
+    
+    double start=usecond();
+    for(int64_t i=0;i<Nloop;i++){
+      mac(z,x,y);
     }
-
+    double stop=usecond();
+    double time = (stop-start)/Nloop*1000.0;
+    
+    double bytes=3*vol*Nc*Nc*sizeof(Complex);
+    double flops=Nc*Nc*(6+8+8)*vol;
+    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+    
+  }
+  
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
@@ -179,7 +179,6 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
   for(int lat=2;lat<=LMAX;lat+=LINC){
-
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
@@ -190,7 +189,7 @@ int main (int argc, char ** argv)
       LatticeColourMatrix x(&Grid); random(pRNG,x);
       LatticeColourMatrix y(&Grid); random(pRNG,y);
 
-      for(int mu=0;mu<=4;mu++){
+      for(int mu=0;mu<4;mu++){
 	double start=usecond();
 	for(int64_t i=0;i<Nloop;i++){
 	  z = PeriodicBC::CovShiftForward(x,mu,y);
@@ -198,6 +197,7 @@ int main (int argc, char ** argv)
 	double stop=usecond();
 	double time = (stop-start)/Nloop*1000.0;
 	
+	
 	double bytes=3*vol*Nc*Nc*sizeof(Complex);
 	double flops=Nc*Nc*(6+8+8)*vol;
 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h
index a66b49bf..98972135 100644
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@@ -54,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
 
 
   if ( !comm_dim ) {
-    //    std::cout << "Cshift_local" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
   } else if ( splice_dim ) {
-    //    std::cout << "Cshift_comms_simd" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift);
   } else {
-    //    std::cout << "Cshift_comms" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
     Cshift_comms(ret,rhs,dimension,shift);
   }
   return ret;
@@ -91,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
 
+  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
   if ( sshift[0] == sshift[1] ) {
+    //std::cout << "Single pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
   } else {
+    //std::cout << "Two pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
   }
@@ -175,6 +178,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   int simd_layout     = grid->_simd_layout[dimension];
   int comm_dim        = grid->_processors[dimension] >1 ;
 
+  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
+  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+
   assert(comm_dim==1);
   assert(simd_layout==2);
   assert(shift>=0);

From 276a2353dfca4677c687effc524b36b5ccfb054b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 25 Apr 2018 00:11:07 +0100
Subject: [PATCH 04/28] Move constructor

---
 lib/lattice/Lattice_base.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index 014e443d..dcd55702 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -257,7 +257,11 @@ public:
     }  	
   }
   
-  
+  Lattice(Lattice&& r){ // move constructor
+    _grid = r._grid;
+    checkerboard = r.checkerboard;
+    _odata=std::move(r._odata);
+  }
   
   virtual ~Lattice(void) = default;
     
@@ -286,6 +290,24 @@ public:
     }
     return *this;
   }
+
+  strong_inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+    return *this;
+  }
+  strong_inline Lattice<vobj> & operator = (Lattice<vobj> && r)
+  {
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata       =std::move(r._odata);
+    return *this;
+  }
   
   // *=,+=,-= operators inherit behvour from correspond */+/- operation
   template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {

From 362ba0443ad73dc726fd45bcf0f7b3447ec9fb11 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 25 Apr 2018 00:12:11 +0100
Subject: [PATCH 05/28] Cshift updates

---
 benchmarks/Benchmark_su3.cc | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 628ad5bd..b458d48a 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -35,17 +35,18 @@ using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
-#define LMAX (40)
+#define LMAX (16)
+#define LMIN (16)
 #define LINC (4)
 
-  int64_t Nloop=20;
+  int64_t Nloop=2000;
 
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
   int64_t threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
+#if 1
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -171,14 +172,14 @@ int main (int argc, char ** argv)
 
     }
 
-
+#endif
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -190,7 +191,7 @@ int main (int argc, char ** argv)
       LatticeColourMatrix x(&Grid); random(pRNG,x);
       LatticeColourMatrix y(&Grid); random(pRNG,y);
 
-      for(int mu=0;mu<=4;mu++){
+      for(int mu=0;mu<4;mu++){
 	double start=usecond();
 	for(int64_t i=0;i<Nloop;i++){
 	  z = PeriodicBC::CovShiftForward(x,mu,y);
@@ -202,7 +203,7 @@ int main (int argc, char ** argv)
 	double flops=Nc*Nc*(8+8+8)*vol;
 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
-    }
+  }
 
   Grid_finalize();
 }

From cc6eb51e3ea83f53f49edcb6c648570ff08717fe Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Wed, 25 Apr 2018 16:49:14 +0100
Subject: [PATCH 06/28] Hadrons: macro refactoring for library portability

---
 extras/Hadrons/Application.cc                    |  8 ++++----
 extras/Hadrons/EigenPack.hpp                     |  2 +-
 extras/Hadrons/Environment.cc                    | 16 ++++++++--------
 extras/Hadrons/Environment.hpp                   | 12 ++++++------
 extras/Hadrons/Exceptions.hpp                    |  8 ++++----
 extras/Hadrons/Factory.hpp                       |  2 +-
 extras/Hadrons/Global.hpp                        |  2 +-
 extras/Hadrons/Graph.hpp                         | 12 ++++++------
 extras/Hadrons/Module.cc                         |  2 +-
 extras/Hadrons/Module.hpp                        |  2 +-
 .../Modules/MContraction/WardIdentity.hpp        |  2 +-
 extras/Hadrons/Modules/MFermion/GaugeProp.hpp    |  2 +-
 extras/Hadrons/Modules/MScalarSUN/Div.hpp        |  2 +-
 extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp |  4 ++--
 extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp      |  2 +-
 extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp   |  2 +-
 extras/Hadrons/Modules/MScalarSUN/Utils.hpp      |  8 ++++----
 extras/Hadrons/Modules/MSolver/RBPrecCG.hpp      |  2 +-
 .../Modules/MUtilities/TestSeqConserved.hpp      |  2 +-
 extras/Hadrons/VirtualMachine.cc                 | 14 +++++++-------
 extras/Hadrons/VirtualMachine.hpp                |  2 +-
 lib/qcd/modules/Factory.h                        |  2 +-
 22 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/extras/Hadrons/Application.cc b/extras/Hadrons/Application.cc
index 4c7e71f2..f409d8eb 100644
--- a/extras/Hadrons/Application.cc
+++ b/extras/Hadrons/Application.cc
@@ -114,12 +114,12 @@ void Application::parseParameterFile(const std::string parameterFileName)
     setPar(par);
     if (!push(reader, "modules"))
     {
-        HADRON_ERROR(Parsing, "Cannot open node 'modules' in parameter file '" 
+        HADRONS_ERROR(Parsing, "Cannot open node 'modules' in parameter file '" 
                               + parameterFileName + "'");
     }
     if (!push(reader, "module"))
     {
-        HADRON_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '" 
+        HADRONS_ERROR(Parsing, "Cannot open node 'modules/module' in parameter file '" 
                               + parameterFileName + "'");
     }
     do
@@ -177,7 +177,7 @@ void Application::saveSchedule(const std::string filename)
         
         if (!scheduled_)
         {
-            HADRON_ERROR(Definition, "Computation not scheduled");
+            HADRONS_ERROR(Definition, "Computation not scheduled");
         }
 
         for (auto address: program_)
@@ -208,7 +208,7 @@ void Application::printSchedule(void)
 {
     if (!scheduled_)
     {
-        HADRON_ERROR(Definition, "Computation not scheduled");
+        HADRONS_ERROR(Definition, "Computation not scheduled");
     }
     auto peak = vm().memoryNeeded(program_);
     LOG(Message) << "Schedule (memory needed: " << sizeString(peak) << "):"
diff --git a/extras/Hadrons/EigenPack.hpp b/extras/Hadrons/EigenPack.hpp
index 5f403595..976fba7b 100644
--- a/extras/Hadrons/EigenPack.hpp
+++ b/extras/Hadrons/EigenPack.hpp
@@ -123,7 +123,7 @@ protected:
             binReader.readScidacFieldRecord(evec[k], vecRecord);
             if (vecRecord.index != k)
             {
-                HADRON_ERROR(Io, "Eigenvector " + std::to_string(k) + " has a"
+                HADRONS_ERROR(Io, "Eigenvector " + std::to_string(k) + " has a"
                              + " wrong index (expected " + std::to_string(vecRecord.index) 
                              + ") in file '" + filename + "'");
             }
diff --git a/extras/Hadrons/Environment.cc b/extras/Hadrons/Environment.cc
index 35bb4648..990a717e 100644
--- a/extras/Hadrons/Environment.cc
+++ b/extras/Hadrons/Environment.cc
@@ -35,7 +35,7 @@ using namespace QCD;
 using namespace Hadrons;
 
 #define ERROR_NO_ADDRESS(address)\
-HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
+HADRONS_ERROR(Definition, "no object with address " + std::to_string(address));
 
 /******************************************************************************
  *                       Environment implementation                           *
@@ -85,7 +85,7 @@ void Environment::createCoarseGrid(const std::vector<int> &blockSize,
         coarseDim[d] = fineDim[d]/blockSize[d];
         if (coarseDim[d]*blockSize[d] != fineDim[d])
         {
-            HADRON_ERROR(Size, "Fine dimension " + std::to_string(d) 
+            HADRONS_ERROR(Size, "Fine dimension " + std::to_string(d) 
                          + " (" + std::to_string(fineDim[d]) 
                          + ") not divisible by coarse dimension ("
                          + std::to_string(coarseDim[d]) + ")"); 
@@ -96,7 +96,7 @@ void Environment::createCoarseGrid(const std::vector<int> &blockSize,
         cLs = Ls/blockSize[nd];
         if (cLs*blockSize[nd] != Ls)
         {
-            HADRON_ERROR(Size, "Fine Ls (" + std::to_string(Ls) 
+            HADRONS_ERROR(Size, "Fine Ls (" + std::to_string(Ls) 
                          + ") not divisible by coarse Ls ("
                          + std::to_string(cLs) + ")");
         }
@@ -128,7 +128,7 @@ GridCartesian * Environment::getGrid(const unsigned int Ls) const
     }
     catch(std::out_of_range &)
     {
-        HADRON_ERROR(Definition, "no grid with Ls= " + std::to_string(Ls));
+        HADRONS_ERROR(Definition, "no grid with Ls= " + std::to_string(Ls));
     }
 }
 
@@ -147,7 +147,7 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
     }
     catch(std::out_of_range &)
     {
-        HADRON_ERROR(Definition, "no red-black grid with Ls= " + std::to_string(Ls));
+        HADRONS_ERROR(Definition, "no red-black grid with Ls= " + std::to_string(Ls));
     }
 }
 
@@ -171,7 +171,7 @@ GridCartesian * Environment::getCoarseGrid(
     }
     catch(std::out_of_range &)
     {
-        HADRON_ERROR(Definition, "no coarse grid with Ls= " + std::to_string(Ls));
+        HADRONS_ERROR(Definition, "no coarse grid with Ls= " + std::to_string(Ls));
     }
 }
 
@@ -221,7 +221,7 @@ void Environment::addObject(const std::string name, const int moduleAddress)
     }
     else
     {
-        HADRON_ERROR(Definition, "object '" + name + "' already exists");
+        HADRONS_ERROR(Definition, "object '" + name + "' already exists");
     }
 }
 
@@ -244,7 +244,7 @@ unsigned int Environment::getObjectAddress(const std::string name) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no object with name '" + name + "'");
+        HADRONS_ERROR(Definition, "no object with name '" + name + "'");
     }
 }
 
diff --git a/extras/Hadrons/Environment.hpp b/extras/Hadrons/Environment.hpp
index f624f36e..a9c3c724 100644
--- a/extras/Hadrons/Environment.hpp
+++ b/extras/Hadrons/Environment.hpp
@@ -245,7 +245,7 @@ void Environment::createDerivedObject(const std::string name,
              (object_[address].type        != &typeid(B))     or
              (object_[address].derivedType != &typeid(T)))
     {
-        HADRON_ERROR(Definition, "object '" + name + "' already allocated");
+        HADRONS_ERROR(Definition, "object '" + name + "' already allocated");
     }
 }
 
@@ -279,7 +279,7 @@ T * Environment::getDerivedObject(const unsigned int address) const
                     }
                     else
                     {
-                        HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
+                        HADRONS_ERROR(Definition, "object with address " + std::to_string(address) +
                             " cannot be casted to '" + typeName(&typeid(T)) +
                             "' (has type '" + typeName(&typeid(h->get())) + "')");
                     }
@@ -287,20 +287,20 @@ T * Environment::getDerivedObject(const unsigned int address) const
             }
             else
             {
-                HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
+                HADRONS_ERROR(Definition, "object with address " + std::to_string(address) +
                             " does not have type '" + typeName(&typeid(B)) +
                             "' (has type '" + getObjectType(address) + "')");
             }
         }
         else
         {
-            HADRON_ERROR(Definition, "object with address " + std::to_string(address) +
+            HADRONS_ERROR(Definition, "object with address " + std::to_string(address) +
                          " is empty");
         }
     }
     else
     {
-        HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no object with address " + std::to_string(address));
     }
 }
 
@@ -338,7 +338,7 @@ bool Environment::isObjectOfType(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no object with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no object with address " + std::to_string(address));
     }
 }
 
diff --git a/extras/Hadrons/Exceptions.hpp b/extras/Hadrons/Exceptions.hpp
index adf2340f..3eb1c25f 100644
--- a/extras/Hadrons/Exceptions.hpp
+++ b/extras/Hadrons/Exceptions.hpp
@@ -34,10 +34,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Grid/Hadrons/Global.hpp>
 #endif
 
-#define SRC_LOC std::string(__FUNCTION__) + " at " + std::string(__FILE__) + ":"\
-                + std::to_string(__LINE__)
-#define HADRON_ERROR(exc, msg)\
-throw(Exceptions::exc(msg, SRC_LOC));
+#define HADRONS_SRC_LOC std::string(__FUNCTION__) + " at " \
+                        + std::string(__FILE__) + ":" + std::to_string(__LINE__)
+#define HADRONS_ERROR(exc, msg)\
+throw(Exceptions::exc(msg, HADRONS_SRC_LOC));
 
 #define DECL_EXC(name, base) \
 class name: public base\
diff --git a/extras/Hadrons/Factory.hpp b/extras/Hadrons/Factory.hpp
index 705a639e..07516640 100644
--- a/extras/Hadrons/Factory.hpp
+++ b/extras/Hadrons/Factory.hpp
@@ -94,7 +94,7 @@ std::unique_ptr<T> Factory<T>::create(const std::string type,
     }
     catch (std::out_of_range &)
     {
-        HADRON_ERROR(Argument, "object of type '" + type + "' unknown");
+        HADRONS_ERROR(Argument, "object of type '" + type + "' unknown");
     }
     
     return func(name);
diff --git a/extras/Hadrons/Global.hpp b/extras/Hadrons/Global.hpp
index 5e729579..433dcd21 100644
--- a/extras/Hadrons/Global.hpp
+++ b/extras/Hadrons/Global.hpp
@@ -110,7 +110,7 @@ public:
 };
 
 #define LOG(channel) std::cout << HadronsLog##channel
-#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
+#define HADRONS_DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
 
 extern HadronsLogger HadronsLogError;
 extern HadronsLogger HadronsLogWarning;
diff --git a/extras/Hadrons/Graph.hpp b/extras/Hadrons/Graph.hpp
index 67694aa8..ad84e7e0 100644
--- a/extras/Hadrons/Graph.hpp
+++ b/extras/Hadrons/Graph.hpp
@@ -184,7 +184,7 @@ void Graph<T>::removeVertex(const T &value)
     }
     else
     {
-        HADRON_ERROR(Range, "vertex does not exists");
+        HADRONS_ERROR(Range, "vertex does not exists");
     }
 
     // remove all edges containing the vertex
@@ -213,7 +213,7 @@ void Graph<T>::removeEdge(const Edge &e)
     }
     else
     {
-        HADRON_ERROR(Range, "edge does not exists");
+        HADRONS_ERROR(Range, "edge does not exists");
     }
 }
 
@@ -259,7 +259,7 @@ void Graph<T>::mark(const T &value, const bool doMark)
     }
     else
     {
-        HADRON_ERROR(Range, "vertex does not exists");
+        HADRONS_ERROR(Range, "vertex does not exists");
     }
 }
 
@@ -297,7 +297,7 @@ bool Graph<T>::isMarked(const T &value) const
     }
     else
     {
-        HADRON_ERROR(Range, "vertex does not exists");
+        HADRONS_ERROR(Range, "vertex does not exists");
         
         return false;
     }
@@ -543,7 +543,7 @@ std::vector<T> Graph<T>::topoSort(void)
     {
         if (tmpMarked.at(v))
         {
-            HADRON_ERROR(Range, "cannot topologically sort a cyclic graph");
+            HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
         }
         if (!isMarked(v))
         {
@@ -602,7 +602,7 @@ std::vector<T> Graph<T>::topoSort(Gen &gen)
     {
         if (tmpMarked.at(v))
         {
-            HADRON_ERROR(Range, "cannot topologically sort a cyclic graph");
+            HADRONS_ERROR(Range, "cannot topologically sort a cyclic graph");
         }
         if (!isMarked(v))
         {
diff --git a/extras/Hadrons/Module.cc b/extras/Hadrons/Module.cc
index 54978f93..faf01d5a 100644
--- a/extras/Hadrons/Module.cc
+++ b/extras/Hadrons/Module.cc
@@ -49,7 +49,7 @@ std::string ModuleBase::getName(void) const
 // get factory registration name if available
 std::string ModuleBase::getRegisteredName(void)
 {
-    HADRON_ERROR(Definition, "module '" + getName() + "' has no registered type"
+    HADRONS_ERROR(Definition, "module '" + getName() + "' has no registered type"
                  + " in the factory");
 }
 
diff --git a/extras/Hadrons/Module.hpp b/extras/Hadrons/Module.hpp
index 656aacef..7f8b7796 100644
--- a/extras/Hadrons/Module.hpp
+++ b/extras/Hadrons/Module.hpp
@@ -126,7 +126,7 @@ if (env().getGrid()->IsBoss())\
     \
     if (mkdir(_dirname))\
     {\
-        HADRON_ERROR(Io, "cannot create directory '" + _dirname + "'");\
+        HADRONS_ERROR(Io, "cannot create directory '" + _dirname + "'");\
     }\
     {\
         ResultWriter _writer(RESULT_FILE_NAME(ioStem));\
diff --git a/extras/Hadrons/Modules/MContraction/WardIdentity.hpp b/extras/Hadrons/Modules/MContraction/WardIdentity.hpp
index f1ca6c2b..e529d7a2 100644
--- a/extras/Hadrons/Modules/MContraction/WardIdentity.hpp
+++ b/extras/Hadrons/Modules/MContraction/WardIdentity.hpp
@@ -119,7 +119,7 @@ void TWardIdentity<FImpl>::setup(void)
     Ls_ = env().getObjectLs(par().q);
     if (Ls_ != env().getObjectLs(par().action))
     {
-        HADRON_ERROR(Size, "Ls mismatch between quark action and propagator");
+        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
     }
     envTmpLat(PropagatorField, "tmp");
     envTmpLat(PropagatorField, "vector_WI");
diff --git a/extras/Hadrons/Modules/MFermion/GaugeProp.hpp b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
index 9a359427..ee21cba9 100644
--- a/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
+++ b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -177,7 +177,7 @@ void TGaugeProp<FImpl>::execute(void)
         {
             if (Ls_ != env().getObjectLs(par().source))
             {
-                HADRON_ERROR(Size, "Ls mismatch between quark action and source");
+                HADRONS_ERROR(Size, "Ls mismatch between quark action and source");
             }
             else
             {
diff --git a/extras/Hadrons/Modules/MScalarSUN/Div.hpp b/extras/Hadrons/Modules/MScalarSUN/Div.hpp
index d1f6df26..1b59fbed 100644
--- a/extras/Hadrons/Modules/MScalarSUN/Div.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/Div.hpp
@@ -112,7 +112,7 @@ void TDiv<SImpl>::setup(void)
 {
     if (par().op.size() != env().getNd())
     {
-        HADRON_ERROR(Size, "the number of components differs from number of dimensions");
+        HADRONS_ERROR(Size, "the number of components differs from number of dimensions");
     }
     envCreateLat(ComplexField, getName());
 }
diff --git a/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp b/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
index 979a340e..c7c0e9ee 100644
--- a/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
@@ -134,14 +134,14 @@ void TShiftProbe<SImpl>::execute(void)
     shift = strToVec<ShiftPair>(par().shifts);
     if (shift.size() % 2 != 0)
     {
-        HADRON_ERROR(Size, "the number of shifts is odd");
+        HADRONS_ERROR(Size, "the number of shifts is odd");
     }
     sign = (shift.size() % 4 == 0) ? 1 : -1;
     for (auto &s: shift)
     {
         if (s.first >= env().getNd())
         {
-            HADRON_ERROR(Size, "dimension to large for shift <" 
+            HADRONS_ERROR(Size, "dimension to large for shift <" 
                                + std::to_string(s.first) + " " 
                                + std::to_string(s.second) + ">" );
         }
diff --git a/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp b/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
index a61c00bc..6c1d733b 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
@@ -119,7 +119,7 @@ void TTrPhi<SImpl>::setup(void)
 {
     if (par().maxPow < 2)
     {
-        HADRON_ERROR(Size, "'maxPow' should be at least equal to 2");
+        HADRONS_ERROR(Size, "'maxPow' should be at least equal to 2");
     }
     envTmpLat(Field, "phi2");
     envTmpLat(Field, "buf");
diff --git a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
index c31b1621..9e53553f 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -143,7 +143,7 @@ void TTwoPoint<SImpl>::setup(void)
         mom_[i] = strToVec<int>(par().mom[i]);
         if (mom_[i].size() != nd - 1)
         {
-            HADRON_ERROR(Size, "momentum number of components different from " 
+            HADRONS_ERROR(Size, "momentum number of components different from " 
                                + std::to_string(nd-1));
         }
     }
diff --git a/extras/Hadrons/Modules/MScalarSUN/Utils.hpp b/extras/Hadrons/Modules/MScalarSUN/Utils.hpp
index b9e49715..37a9e137 100644
--- a/extras/Hadrons/Modules/MScalarSUN/Utils.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/Utils.hpp
@@ -44,7 +44,7 @@ inline void dmu(Field &out, const Field &in, const unsigned int mu, const DiffTy
 
     if (mu >= env.getNd())
     {
-        HADRON_ERROR(Range, "Derivative direction out of range");
+        HADRONS_ERROR(Range, "Derivative direction out of range");
     }
     switch(type)
     {
@@ -58,7 +58,7 @@ inline void dmu(Field &out, const Field &in, const unsigned int mu, const DiffTy
             out = 0.5*(Cshift(in, mu, 1) - Cshift(in, mu, -1));
             break;
         default:
-            HADRON_ERROR(Argument, "Derivative type invalid");
+            HADRONS_ERROR(Argument, "Derivative type invalid");
             break;
     }
 }
@@ -70,7 +70,7 @@ inline void dmuAcc(Field &out, const Field &in, const unsigned int mu, const Dif
 
     if (mu >= env.getNd())
     {
-        HADRON_ERROR(Range, "Derivative direction out of range");
+        HADRONS_ERROR(Range, "Derivative direction out of range");
     }
     switch(type)
     {
@@ -84,7 +84,7 @@ inline void dmuAcc(Field &out, const Field &in, const unsigned int mu, const Dif
             out += 0.5*(Cshift(in, mu, 1) - Cshift(in, mu, -1));
             break;
         default:
-            HADRON_ERROR(Argument, "Derivative type invalid");
+            HADRONS_ERROR(Argument, "Derivative type invalid");
             break;
     }
 }
diff --git a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp b/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
index f559c4eb..206d44d1 100644
--- a/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
+++ b/extras/Hadrons/Modules/MSolver/RBPrecCG.hpp
@@ -127,7 +127,7 @@ void TRBPrecCG<FImpl, nBasis>::setup(void)
 {
     if (par().maxIteration == 0)
     {
-        HADRON_ERROR(Argument, "zero maximum iteration");
+        HADRONS_ERROR(Argument, "zero maximum iteration");
     }
 
     LOG(Message) << "setting up Schur red-black preconditioned CG for"
diff --git a/extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp b/extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
index dcd1ba9a..7b1bc1db 100644
--- a/extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
+++ b/extras/Hadrons/Modules/MUtilities/TestSeqConserved.hpp
@@ -123,7 +123,7 @@ void TTestSeqConserved<FImpl>::setup(void)
     auto Ls = env().getObjectLs(par().q);
     if (Ls != env().getObjectLs(par().action))
     {
-        HADRON_ERROR(Size, "Ls mismatch between quark action and propagator");
+        HADRONS_ERROR(Size, "Ls mismatch between quark action and propagator");
     }
     envTmpLat(PropagatorField, "tmp");
     envTmpLat(LatticeComplex, "c");
diff --git a/extras/Hadrons/VirtualMachine.cc b/extras/Hadrons/VirtualMachine.cc
index dffabe29..cc197ef8 100644
--- a/extras/Hadrons/VirtualMachine.cc
+++ b/extras/Hadrons/VirtualMachine.cc
@@ -123,7 +123,7 @@ void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
                 else
                 {
                     // output already fully registered, error
-                    HADRON_ERROR(Definition, "object '" + out
+                    HADRONS_ERROR(Definition, "object '" + out
                                  + "' is already produced by module '"
                                  + module_[env().getObjectModule(out)].name
                                  + "' (while pushing module '" + name + "')");
@@ -158,7 +158,7 @@ void VirtualMachine::pushModule(VirtualMachine::ModPt &pt)
     }
     else
     {
-        HADRON_ERROR(Definition, "module '" + name + "' already exists");
+        HADRONS_ERROR(Definition, "module '" + name + "' already exists");
     }
 }
 
@@ -185,7 +185,7 @@ ModuleBase * VirtualMachine::getModule(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no module with address " + std::to_string(address));
     }
 }
 
@@ -202,7 +202,7 @@ unsigned int VirtualMachine::getModuleAddress(const std::string name) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with name '" + name + "'");
+        HADRONS_ERROR(Definition, "no module with name '" + name + "'");
     }
 }
 
@@ -214,7 +214,7 @@ std::string VirtualMachine::getModuleName(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no module with address " + std::to_string(address));
     }
 }
 
@@ -226,7 +226,7 @@ std::string VirtualMachine::getModuleType(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "no module with address " + std::to_string(address));
+        HADRONS_ERROR(Definition, "no module with address " + std::to_string(address));
     }
 }
 
@@ -306,7 +306,7 @@ void VirtualMachine::makeModuleGraph(void)
 
             if (min < 0)
             {
-                HADRON_ERROR(Definition, "dependency '" 
+                HADRONS_ERROR(Definition, "dependency '" 
                              + env().getObjectName(in) + "' (address " 
                              + std::to_string(in)
                              + ") is not produced by any module");
diff --git a/extras/Hadrons/VirtualMachine.hpp b/extras/Hadrons/VirtualMachine.hpp
index 153f8d70..ccc06d63 100644
--- a/extras/Hadrons/VirtualMachine.hpp
+++ b/extras/Hadrons/VirtualMachine.hpp
@@ -195,7 +195,7 @@ M * VirtualMachine::getModule(const unsigned int address) const
     }
     else
     {
-        HADRON_ERROR(Definition, "module '" + module_[address].name
+        HADRONS_ERROR(Definition, "module '" + module_[address].name
                      + "' does not have type " + typeid(M).name()
                      + "(has type: " + getModuleType(address) + ")");
     }
diff --git a/lib/qcd/modules/Factory.h b/lib/qcd/modules/Factory.h
index 66bd627e..fafa2038 100644
--- a/lib/qcd/modules/Factory.h
+++ b/lib/qcd/modules/Factory.h
@@ -95,7 +95,7 @@ std::unique_ptr<T> Factory<T, CreatorInput>::create(const std::string type,
     }
     catch (std::out_of_range &)
     {
-      //HADRON_ERROR("object of type '" + type + "' unknown");
+      //HADRONS_ERROR("object of type '" + type + "' unknown");
     	std::cout << GridLogError << "Error" << std::endl;
     	std::cout << GridLogError << obj_type() << " object of name [" << type << "] unknown" << std::endl;
     	exit(1);

From 43f5a0df50090b2f784653a3d3b2e569ae08b1a4 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 26 Apr 2018 12:01:56 +0900
Subject: [PATCH 07/28] More timers in the integrator

---
 benchmarks/Benchmark_su3.cc           |  22 ++---
 lib/qcd/hmc/integrators/Integrator.h  |  12 ++-
 lib/qcd/smearing/GaugeConfiguration.h | 137 ++++++++++++++++----------
 3 files changed, 104 insertions(+), 67 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index d9f1341c..5f2d83d2 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -35,8 +35,8 @@ using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
-#define LMAX (16)
-#define LMIN (16)
+#define LMAX (32)
+#define LMIN (4)
 #define LINC (4)
 
   int64_t Nloop=2000;
@@ -193,17 +193,17 @@ int main (int argc, char ** argv)
       LatticeColourMatrix y(&Grid); random(pRNG,y);
 
       for(int mu=0;mu<4;mu++){
-	double start=usecond();
-	for(int64_t i=0;i<Nloop;i++){
-	  z = PeriodicBC::CovShiftForward(x,mu,y);
-	}
-	double stop=usecond();
-	double time = (stop-start)/Nloop*1000.0;
+	      double start=usecond();
+	      for(int64_t i=0;i<Nloop;i++){
+	        z = PeriodicBC::CovShiftForward(x,mu,y);
+	    }
+	    double stop=usecond();
+	    double time = (stop-start)/Nloop*1000.0;
 	
 	
-	double bytes=3*vol*Nc*Nc*sizeof(Complex);
-	double flops=Nc*Nc*(6+8+8)*vol;
-	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+	    double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	    double flops=Nc*Nc*(6+8+8)*vol;
+	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
   }
 
diff --git a/lib/qcd/hmc/integrators/Integrator.h b/lib/qcd/hmc/integrators/Integrator.h
index 95b19904..232eba42 100644
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -114,18 +114,26 @@ class Integrator {
     // input U actually not used in the fundamental case
     // Fundamental updates, include smearing
 
-   for (int a = 0; a < as[level].actions.size(); ++a) {
+    for (int a = 0; a < as[level].actions.size(); ++a) {
+      double start_full = usecond();
       Field force(U._grid);
       conformable(U._grid, Mom._grid);
+
       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
+      double start_force = usecond();
       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
 
       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
       force = FieldImplementation::projectForce(force); // Ta for gauge fields
+      double end_force = usecond();
       Real force_abs = std::sqrt(norm2(force)/U._grid->gSites());
-      std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
       Mom -= force * ep; 
+      double end_full = usecond();
+      double time_full  = (end_full - start_full) / 1e3;
+      double time_force = (end_force - start_force) / 1e3;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
     }
 
     // Force from the other representations
diff --git a/lib/qcd/smearing/GaugeConfiguration.h b/lib/qcd/smearing/GaugeConfiguration.h
index fc045ba2..55d5351f 100644
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/lib/qcd/smearing/GaugeConfiguration.h
@@ -6,30 +6,33 @@
 #ifndef GAUGE_CONFIG_
 #define GAUGE_CONFIG_
 
-namespace Grid {
+namespace Grid
+{
 
-namespace QCD {
+namespace QCD
+{
 
-  //trivial class for no smearing
-  template< class Impl >
-class NoSmearing {
+//trivial class for no smearing
+template <class Impl>
+class NoSmearing
+{
 public:
   INHERIT_FIELD_TYPES(Impl);
 
-  Field* ThinField;
+  Field *ThinField;
 
-  NoSmearing(): ThinField(NULL) {}
+  NoSmearing() : ThinField(NULL) {}
 
-  void set_Field(Field& U) { ThinField = &U; }
+  void set_Field(Field &U) { ThinField = &U; }
 
-  void smeared_force(Field&) const {}
+  void smeared_force(Field &) const {}
 
-  Field& get_SmearedU() { return *ThinField; }
+  Field &get_SmearedU() { return *ThinField; }
 
-  Field& get_U(bool smeared = false) {
+  Field &get_U(bool smeared = false)
+  {
     return *ThinField;
   }
-
 };
 
 /*!
@@ -44,32 +47,36 @@ public:
   It stores a list of smeared configurations.
 */
 template <class Gimpl>
-class SmearedConfiguration {
- public:
+class SmearedConfiguration
+{
+public:
   INHERIT_GIMPL_TYPES(Gimpl);
 
- private:
+private:
   const unsigned int smearingLevels;
   Smear_Stout<Gimpl> StoutSmearing;
   std::vector<GaugeField> SmearedSet;
 
   // Member functions
   //====================================================================
-  void fill_smearedSet(GaugeField& U) {
-    ThinLinks = &U;  // attach the smearing routine to the field U
+  void fill_smearedSet(GaugeField &U)
+  {
+    ThinLinks = &U; // attach the smearing routine to the field U
 
     // check the pointer is not null
     if (ThinLinks == NULL)
       std::cout << GridLogError
                 << "[SmearedConfiguration] Error in ThinLinks pointer\n";
 
-    if (smearingLevels > 0) {
+    if (smearingLevels > 0)
+    {
       std::cout << GridLogDebug
                 << "[SmearedConfiguration] Filling SmearedSet\n";
       GaugeField previous_u(ThinLinks->_grid);
 
       previous_u = *ThinLinks;
-      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
+      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
+      {
         StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
         previous_u = SmearedSet[smearLvl];
 
@@ -81,9 +88,10 @@ class SmearedConfiguration {
     }
   }
   //====================================================================
-  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
-                                  const GaugeField& GaugeK) const {
-    GridBase* grid = GaugeK._grid;
+  GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime,
+                                  const GaugeField &GaugeK) const
+  {
+    GridBase *grid = GaugeK._grid;
     GaugeField C(grid), SigmaK(grid), iLambda(grid);
     GaugeLinkField iLambda_mu(grid);
     GaugeLinkField iQ(grid), e_iQ(grid);
@@ -94,7 +102,8 @@ class SmearedConfiguration {
     SigmaK = zero;
     iLambda = zero;
 
-    for (int mu = 0; mu < Nd; mu++) {
+    for (int mu = 0; mu < Nd; mu++)
+    {
       Cmu = peekLorentz(C, mu);
       GaugeKmu = peekLorentz(GaugeK, mu);
       SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
@@ -104,20 +113,22 @@ class SmearedConfiguration {
       pokeLorentz(iLambda, iLambda_mu, mu);
     }
     StoutSmearing.derivative(SigmaK, iLambda,
-                             GaugeK);  // derivative of SmearBase
+                             GaugeK); // derivative of SmearBase
     return SigmaK;
   }
 
   /*! @brief Returns smeared configuration at level 'Level' */
-  const GaugeField& get_smeared_conf(int Level) const {
+  const GaugeField &get_smeared_conf(int Level) const
+  {
     return SmearedSet[Level];
   }
 
   //====================================================================
-  void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
-                   const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
-                   const GaugeLinkField& GaugeK) const {
-    GridBase* grid = iQ._grid;
+  void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ,
+                   const GaugeLinkField &iQ, const GaugeLinkField &Sigmap,
+                   const GaugeLinkField &GaugeK) const
+  {
+    GridBase *grid = iQ._grid;
     GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
     GaugeLinkField unity(grid);
     unity = 1.0;
@@ -206,15 +217,15 @@ class SmearedConfiguration {
   }
 
   //====================================================================
- public:
-  GaugeField*
-      ThinLinks; /*!< @brief Pointer to the thin
-                                                         links configuration */
+public:
+  GaugeField *
+      ThinLinks; /* Pointer to the thin links configuration */
 
-  /*! @brief Standard constructor */
-  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
-                       Smear_Stout<Gimpl>& Stout)
-      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
+  /* Standard constructor */
+  SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear,
+                       Smear_Stout<Gimpl> &Stout)
+      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
+  {
     for (unsigned int i = 0; i < smearingLevels; ++i)
       SmearedSet.push_back(*(new GaugeField(UGrid)));
   }
@@ -223,21 +234,29 @@ class SmearedConfiguration {
   SmearedConfiguration()
       : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
 
-
-  
   // attach the smeared routines to the thin links U and fill the smeared set
-  void set_Field(GaugeField& U) { fill_smearedSet(U); }
+  void set_Field(GaugeField &U)
+  {
+    double start = usecond();
+    fill_smearedSet(U);
+    double end = usecond();
+    double time = (stop - start)/ 1e3;
+    std::cout << GridLogMessage << "Smearing in " << time << " ms" << std::endl;  
+  }
 
   //====================================================================
-  void smeared_force(GaugeField& SigmaTilde) const {
-    if (smearingLevels > 0) {
+  void smeared_force(GaugeField &SigmaTilde) const
+  {
+    if (smearingLevels > 0)
+    {
+      double start = usecond();
       GaugeField force = SigmaTilde; // actually = U*SigmaTilde
       GaugeLinkField tmp_mu(SigmaTilde._grid);
 
-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
         // to get just SigmaTilde
-        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
-                 peekLorentz(force, mu);
+        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu);
         pokeLorentz(force, tmp_mu, mu);
       }
 
@@ -246,33 +265,43 @@ class SmearedConfiguration {
 
       force = AnalyticSmearedForce(force, *ThinLinks);
 
-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
         tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
         pokeLorentz(SigmaTilde, tmp_mu, mu);
       }
-    }  // if smearingLevels = 0 do nothing
+      double end = usecond();
+      double time = (stop - start)/ 1e3;
+      std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
+    } // if smearingLevels = 0 do nothing
   }
   //====================================================================
 
-  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
+  GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
 
-  GaugeField& get_U(bool smeared = false) {
+  GaugeField &get_U(bool smeared = false)
+  {
     // get the config, thin links by default
-    if (smeared) {
-      if (smearingLevels) {
+    if (smeared)
+    {
+      if (smearingLevels)
+      {
         RealD impl_plaq =
             WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
         std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
                   << std::endl;
         return get_SmearedU();
-
-      } else {
+      }
+      else
+      {
         RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
         std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                   << std::endl;
         return *ThinLinks;
       }
-    } else {
+    }
+    else
+    {
       RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
       std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                 << std::endl;

From 6358f35b7e93d36f1f9a4378ddd53cc2a88242be Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Thu, 26 Apr 2018 14:18:11 +0900
Subject: [PATCH 08/28] Debug of previous commit

---
 lib/qcd/smearing/GaugeConfiguration.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/qcd/smearing/GaugeConfiguration.h b/lib/qcd/smearing/GaugeConfiguration.h
index 55d5351f..6fea875b 100644
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/lib/qcd/smearing/GaugeConfiguration.h
@@ -240,7 +240,7 @@ public:
     double start = usecond();
     fill_smearedSet(U);
     double end = usecond();
-    double time = (stop - start)/ 1e3;
+    double time = (end - start)/ 1e3;
     std::cout << GridLogMessage << "Smearing in " << time << " ms" << std::endl;  
   }
 
@@ -271,7 +271,7 @@ public:
         pokeLorentz(SigmaTilde, tmp_mu, mu);
       }
       double end = usecond();
-      double time = (stop - start)/ 1e3;
+      double time = (end - start)/ 1e3;
       std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
     } // if smearingLevels = 0 do nothing
   }

From 8f44c799a69d3041f00af7b7785a268a914ed6c5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 14:48:03 +0100
Subject: [PATCH 09/28] Saving the benchmarking tests for Cshift

---
 benchmarks/Benchmark_su3.cc | 64 +++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 9 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index b31af942..656f816a 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -35,24 +35,25 @@ using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
+#define LMIN (16)
 #define LMAX (40)
-#define LINC (4)
+#define LINC (8)
 
-  int64_t Nloop=20;
+  int64_t Nloop=200;
 
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
   int64_t threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
+#if 0
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=4;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -84,7 +85,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -115,7 +116,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -146,7 +147,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
     
     std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
     int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -170,7 +171,6 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
     
   }
-  
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
@@ -178,7 +178,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
@@ -203,6 +203,52 @@ int main (int argc, char ** argv)
 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
     }
+#endif  
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
+      LatticeColourMatrix tmp(&Grid);
+
+      for(int mu=0;mu<4;mu++){
+	double tshift=0;
+	double tmult =0;
+
+	double start=usecond();
+	for(int64_t i=0;i<Nloop;i++){
+	  tshift-=usecond();
+	  tmp = Cshift(y,mu,-1);
+	  tshift+=usecond();
+	  tmult-=usecond();
+	  z   = x*tmp;
+	  tmult+=usecond();
+	}
+	double stop=usecond();
+	double time = (stop-start)/Nloop;
+	tshift = tshift/Nloop;
+	tmult  = tmult /Nloop;
+	
+	double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	double flops=Nc*Nc*(6+8+8)*vol;
+	std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
+	time = time * 1000; // convert to NS for GB/s
+	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      }
+    }
+
 
   Grid_finalize();
 }

From 91a0a3f8204fc2b070a7a913a1cd1fa29d4a45ac Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 14:48:35 +0100
Subject: [PATCH 10/28] Improvement

---
 benchmarks/Benchmark_memory_bandwidth.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 848f271d..cc965050 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  uint64_t lmax=96;
+  uint64_t lmax=64;
 #define NLOOP (10*lmax*lmax*lmax*lmax/vol)
   for(int lat=8;lat<=lmax;lat+=8){
 

From 3b0638174519e83fdb3937bef006dd12c183017c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 14:48:57 +0100
Subject: [PATCH 11/28] Guard bare openmp statemetn with ifdef

---
 lib/allocator/AlignedAllocator.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 3b27aec9..b0f7e206 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -277,7 +277,9 @@ public:
     uint8_t *cp = (uint8_t *)ptr;
     if ( ptr ) { 
     // One touch per 4k page, static OMP loop to catch same loop order
+#ifdef GRID_OMP
 #pragma omp parallel for schedule(static)
+#endif
       for(size_type n=0;n<bytes;n+=4096){
 	cp[n]=0;
       }

From 28a375d35de18cc3efa3f28301ab2d731daaaf3b Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 14:49:42 +0100
Subject: [PATCH 12/28] Force static

---
 lib/threads/Threads.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/threads/Threads.h b/lib/threads/Threads.h
index 36daf2af..dacaf5d8 100644
--- a/lib/threads/Threads.h
+++ b/lib/threads/Threads.h
@@ -40,7 +40,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
-#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for schedule(static) collapse(2)")
 #define PARALLEL_REGION       _Pragma("omp parallel")
 #define PARALLEL_CRITICAL     _Pragma("omp critical")
 #else

From 03e9832efa55892c00e028fd4601220c7378a13d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 14:50:02 +0100
Subject: [PATCH 13/28] Use macros for bare openmp

---
 lib/parallelIO/BinaryIO.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 45fd522e..ce84fc81 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -110,11 +110,11 @@ class BinaryIO {
       lsites = 1;
     }
 
-    #pragma omp parallel
+PARALLEL_REGION
     {
       uint32_t nersc_csum_thr = 0;
 
-      #pragma omp for
+PARALLEL_FOR_LOOP_INTERN
       for (uint64_t local_site = 0; local_site < lsites; local_site++)
       {
         uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
@@ -124,7 +124,7 @@ class BinaryIO {
         }
       }
 
-      #pragma omp critical
+PARALLEL_CRITICAL
       {
         nersc_csum += nersc_csum_thr;
       }
@@ -146,14 +146,14 @@ class BinaryIO {
     std::vector<int> local_start =grid->LocalStarts();
     std::vector<int> global_vol  =grid->FullDimensions();
 
-#pragma omp parallel
+PARALLEL_REGION
     { 
       std::vector<int> coor(nd);
       uint32_t scidac_csuma_thr=0;
       uint32_t scidac_csumb_thr=0;
       uint32_t site_crc=0;
 
-#pragma omp for
+PARALLEL_FOR_LOOP_INTERN
       for(uint64_t local_site=0;local_site<lsites;local_site++){
 
 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@@ -183,7 +183,7 @@ class BinaryIO {
 	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
       }
 
-#pragma omp critical
+PARALLEL_CRITICAL
       {
 	scidac_csuma^= scidac_csuma_thr;
 	scidac_csumb^= scidac_csumb_thr;

From d47484717e91db165f8e1d35446689dd2400e067 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 26 Apr 2018 17:32:37 +0100
Subject: [PATCH 14/28] Hadrons: scalar SU(N) result handling improvement

---
 extras/Hadrons/Modules/MScalarSUN/Div.hpp     | 17 +++---
 extras/Hadrons/Modules/MScalarSUN/EMT.hpp     | 35 ++++++++++-
 extras/Hadrons/Modules/MScalarSUN/Grad.hpp    | 35 +++++------
 .../Hadrons/Modules/MScalarSUN/ShiftProbe.hpp | 24 +++++---
 .../Hadrons/Modules/MScalarSUN/TrKinetic.hpp  | 60 ++++++++-----------
 extras/Hadrons/Modules/MScalarSUN/TrMag.hpp   | 21 +++----
 extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp   | 22 +++----
 .../Hadrons/Modules/MScalarSUN/TransProj.hpp  | 42 ++++++-------
 .../Hadrons/Modules/MScalarSUN/TwoPoint.hpp   | 24 ++++----
 9 files changed, 156 insertions(+), 124 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalarSUN/Div.hpp b/extras/Hadrons/Modules/MScalarSUN/Div.hpp
index 1b59fbed..ff26c60b 100644
--- a/extras/Hadrons/Modules/MScalarSUN/Div.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/Div.hpp
@@ -49,19 +49,20 @@ public:
                                     std::string,              output);
 };
 
+class DivResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(DivResult,
+                                    DiffType, type,
+                                    Complex,  value);
+};
+
 template <typename SImpl>
 class TDiv: public Module<DivPar>
 {
 public:
     typedef typename SImpl::Field        Field;
     typedef typename SImpl::ComplexField ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        DiffType, type,
-                                        Complex,  value);
-    };
 public:
     // constructor
     TDiv(const std::string name);
@@ -139,7 +140,7 @@ void TDiv<SImpl>::execute(void)
     }
     if (!par().output.empty())
     {
-        Result       r;
+        DivResult r;
 
         r.type  = par().type;
         r.value = TensorRemove(sum(div));
diff --git a/extras/Hadrons/Modules/MScalarSUN/EMT.hpp b/extras/Hadrons/Modules/MScalarSUN/EMT.hpp
index 025b7936..dbbfb6b3 100644
--- a/extras/Hadrons/Modules/MScalarSUN/EMT.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/EMT.hpp
@@ -54,6 +54,17 @@ public:
                                     std::string, output);
 };
 
+class EMTResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(EMTResult,
+                                    std::vector<std::vector<Complex>>, value,
+                                    double,                            m2,
+                                    double,                            lambda,
+                                    double,                            g,
+                                    double,                            xi);
+};
+
 template <typename SImpl>
 class TEMT: public Module<EMTPar>
 {
@@ -155,13 +166,22 @@ void TEMT<SImpl>::execute(void)
         LOG(Message) << "             xi= " << par().xi << std::endl;
     }
 
-    const unsigned int N       = SImpl::Group::Dimension;
+    const unsigned int N = SImpl::Group::Dimension, nd = env().getNd();
     auto               &trphi2 = envGet(ComplexField, varName(par().phiPow, 2));
     auto               &trphi4 = envGet(ComplexField, varName(par().phiPow, 4));
     auto               &sumkin = envGet(ComplexField, varName(par().kinetic, "sum"));
+    EMTResult          result;
 
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    if (!par().output.empty())
+    {
+        result.m2     = par().m2;
+        result.g      = par().g;
+        result.lambda = par().lambda;
+        result.xi     = par().xi;
+        result.value.resize(nd, std::vector<Complex>(nd));
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
     {
         auto &out   = envGet(ComplexField, varName(getName(), mu, nu));
         auto &trkin = envGet(ComplexField, varName(par().kinetic, mu, nu));
@@ -178,6 +198,15 @@ void TEMT<SImpl>::execute(void)
             out -= sumkin + par().m2*trphi2 + par().lambda*trphi4;
         }
         out *= N/par().g;
+        if (!par().output.empty())
+        {
+            result.value[mu][nu] = TensorRemove(sum(out));
+            result.value[mu][nu] = result.value[nu][mu];
+        }
+    }
+    if (!par().output.empty())
+    {
+        saveResult(par().output, "emt", result);
     }
 }
 
diff --git a/extras/Hadrons/Modules/MScalarSUN/Grad.hpp b/extras/Hadrons/Modules/MScalarSUN/Grad.hpp
index 7718fbf2..ecf65e90 100644
--- a/extras/Hadrons/Modules/MScalarSUN/Grad.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/Grad.hpp
@@ -49,19 +49,20 @@ public:
                                     std::string, output);
 };
 
+class GradResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(GradResult,
+                                    DiffType,              type,
+                                    std::vector<Complex>,  value);
+};
+
 template <typename SImpl>
 class TGrad: public Module<GradPar>
 {
 public:
     typedef typename SImpl::Field        Field;
     typedef typename SImpl::ComplexField ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        DiffType, type,
-                                        Complex,  value);
-    };
 public:
     // constructor
     TGrad(const std::string name);
@@ -130,14 +131,18 @@ void TGrad<SImpl>::setup(void)
 template <typename SImpl>
 void TGrad<SImpl>::execute(void)
 {
-    const auto nd = env().getNd();
-
     LOG(Message) << "Computing the " << par().type << " gradient of '"
                  << par().op << "'" << std::endl;
 
-    std::vector<Result> result;
-    auto                &op = envGet(ComplexField, par().op);
+    const unsigned int nd = env().getNd();
+    GradResult         result;
+    auto               &op = envGet(ComplexField, par().op);
 
+    if (!par().output.empty())
+    {
+        result.type = par().type;
+        result.value.resize(nd);
+    }
     for (unsigned int mu = 0; mu < nd; ++mu)
     {
         auto &der = envGet(ComplexField, varName(getName(), mu));
@@ -145,14 +150,10 @@ void TGrad<SImpl>::execute(void)
         dmu(der, op, mu, par().type);
         if (!par().output.empty())
         {
-            Result r;
-
-            r.type  = par().type;
-            r.value = TensorRemove(sum(der));
-            result.push_back(r);
+            result.value[mu] = TensorRemove(sum(der));
         }
     }
-    if (result.size() > 0)
+    if (!par().output.empty())
     {
         saveResult(par().output, "grad", result);
     }
diff --git a/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp b/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
index c7c0e9ee..cd7c15eb 100644
--- a/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/ShiftProbe.hpp
@@ -51,20 +51,20 @@ public:
                                     std::string, output);
 };
 
+class ShiftProbeResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ShiftProbeResult,
+                                    std::string, shifts,
+                                    Complex,     value);
+};
+
 template <typename SImpl>
 class TShiftProbe: public Module<ShiftProbePar>
 {
 public:
-    
     typedef typename SImpl::Field                          Field;
     typedef typename SImpl::ComplexField                   ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::string, op,
-                                        Complex    , value);
-    };
 public:
     // constructor
     TShiftProbe(const std::string name);
@@ -160,6 +160,14 @@ void TShiftProbe<SImpl>::execute(void)
         }
     }
     probe = real(sign*trace(acc));
+    if (!par().output.empty())
+    {
+        ShiftProbeResult r;
+
+        r.shifts = par().shifts;
+        r.value  = TensorRemove(sum(probe));
+        saveResult(par().output, "probe", r);
+    }
 }
 
 END_MODULE_NAMESPACE
diff --git a/extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp b/extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
index 59aa27b8..a714daaa 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrKinetic.hpp
@@ -49,19 +49,20 @@ public:
                                     std::string,  output);
 };
 
+class TrKineticResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrKineticResult,
+                                    std::vector<std::vector<Complex>>, value,
+                                    DiffType,                          type);
+};
+
 template <typename SImpl>
 class TTrKinetic: public Module<TrKineticPar>
 {
 public:
     typedef typename SImpl::Field        Field;
     typedef typename SImpl::ComplexField ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::string, op,
-                                        Complex    , value);
-    };
 public:
     // constructor
     TTrKinetic(const std::string name);
@@ -135,18 +136,24 @@ void TTrKinetic<SImpl>::execute(void)
     LOG(Message) << "Computing tr(d_mu phi*d_nu phi) using " << par().type
                  << " derivative" << std::endl; 
 
-    std::vector<Result> result;
-    auto                &phi    = envGet(Field, par().field);
-    auto                &sumkin = envGet(ComplexField, varName(getName(), "sum"));
+    const unsigned int nd = env().getNd();
+    TrKineticResult    result;
+    auto               &phi    = envGet(Field, par().field);
+    auto               &sumkin = envGet(ComplexField, varName(getName(), "sum"));
 
     envGetTmp(std::vector<Field>, der);
     sumkin = zero;
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    if (!par().output.empty())
+    {
+        result.type = par().type;
+        result.value.resize(nd, std::vector<Complex>(nd));
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
     {
         dmu(der[mu], phi, mu, par().type);
     }
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
     {
         auto &out = envGet(ComplexField, varName(getName(), mu, nu));
 
@@ -155,32 +162,13 @@ void TTrKinetic<SImpl>::execute(void)
         {
             sumkin += out;
         }
-    }
-    if (!par().output.empty())
-    {
-        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-        for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+        if (!par().output.empty())
         {
-            auto &out = envGet(ComplexField, varName(getName(), mu, nu));
-            Result r;
-
-            r.op    = "tr(d_" + std::to_string(mu) + "phi*d_" 
-                      + std::to_string(nu) + "phi)";
-            r.value = TensorRemove(sum(out));
-            result.push_back(r);
-        }
-        {
-            Result r;
-
-            r.op    = "sum_mu tr(d_mu phi*d_mu phi)";
-            r.value = TensorRemove(sum(sumkin));
-            result.push_back(r);
+            result.value[mu][nu] = TensorRemove(sum(out));
+            result.value[mu][nu] = result.value[nu][mu];
         }
     }
-    if (result.size() > 0)
-    {
-        saveResult(par().output, "trkinetic", result);
-    }
+    saveResult(par().output, "trkinetic", result);
 }
 
 END_MODULE_NAMESPACE
diff --git a/extras/Hadrons/Modules/MScalarSUN/TrMag.hpp b/extras/Hadrons/Modules/MScalarSUN/TrMag.hpp
index ed1a629a..cdbf7e30 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TrMag.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrMag.hpp
@@ -49,19 +49,20 @@ public:
                                     std::string,  output);
 };
 
+class TrMagResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrMagResult,
+                                    std::string, op,
+                                    Real,        value);
+};
+
 template <typename SImpl>
 class TTrMag: public Module<TrMagPar>
 {
 public:
     typedef typename SImpl::Field        Field;
     typedef typename SImpl::ComplexField ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::string, op,
-                                        Real,        value);
-    };
 public:
     // constructor
     TTrMag(const std::string name);
@@ -120,8 +121,8 @@ void TTrMag<SImpl>::execute(void)
     LOG(Message) << "Computing tr(mag^n) for n even up to " << par().maxPow
                  << std::endl;
 
-    std::vector<Result> result;
-    auto                &phi = envGet(Field, par().field);
+    std::vector<TrMagResult> result;
+    auto                     &phi = envGet(Field, par().field);
 
     auto m2 = sum(phi), mn = m2;
 
@@ -129,7 +130,7 @@ void TTrMag<SImpl>::execute(void)
     mn = 1.;
     for (unsigned int n = 2; n <= par().maxPow; n += 2)
     {
-        Result r;
+        TrMagResult r;
 
         mn = mn*m2;
         r.op    = "tr(mag^" + std::to_string(n) + ")";
diff --git a/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp b/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
index 6c1d733b..9be0a5d6 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TrPhi.hpp
@@ -49,19 +49,21 @@ public:
                                     std::string,  output);
 };
 
+class TrPhiResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrPhiResult,
+                                    std::string, op,
+                                    Real,        value);
+};
+
 template <typename SImpl>
 class TTrPhi: public Module<TrPhiPar>
 {
 public:
     typedef typename SImpl::Field        Field;
     typedef typename SImpl::ComplexField ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::string, op,
-                                        Real,        value);
-    };
+
 public:
     // constructor
     TTrPhi(const std::string name);
@@ -136,8 +138,8 @@ void TTrPhi<SImpl>::execute(void)
     LOG(Message) << "Computing tr(phi^n) for n even up to " << par().maxPow
                  << std::endl; 
 
-    std::vector<Result> result;
-    auto                &phi = envGet(Field, par().field);
+    std::vector<TrPhiResult> result;
+    auto                     &phi = envGet(Field, par().field);
 
     envGetTmp(Field, phi2);
     envGetTmp(Field, buf);
@@ -151,7 +153,7 @@ void TTrPhi<SImpl>::execute(void)
         phin = trace(buf);
         if (!par().output.empty())
         {
-            Result r;
+            TrPhiResult r;
 
             r.op    = "tr(phi^" + std::to_string(n) + ")";
             r.value = TensorRemove(sum(phin)).real();
diff --git a/extras/Hadrons/Modules/MScalarSUN/TransProj.hpp b/extras/Hadrons/Modules/MScalarSUN/TransProj.hpp
index 6c6502fc..c9b42bf0 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TransProj.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TransProj.hpp
@@ -49,19 +49,20 @@ public:
                                     std::string,  output);
 };
 
+class TransProjResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TransProjResult,
+                                    std::vector<std::vector<Complex>>, value,
+                                    DiffType,                          type);
+};
+
 template <typename SImpl>
 class TTransProj: public Module<TransProjPar>
 {
 public:
     typedef typename SImpl::Field        Field;
     typedef typename SImpl::ComplexField ComplexField;
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::string, op,
-                                        Complex    , value);
-    };
 public:
     // constructor
     TTransProj(const std::string name);
@@ -137,21 +138,27 @@ void TTransProj<SImpl>::execute(void)
                  << par().type << " derivatives and op= '" << par().op 
                  << "'" << std::endl; 
 
-    std::vector<Result> result;
-    auto                &op = envGet(ComplexField, par().op);
+    const unsigned int nd = env().getNd();
+    TransProjResult    result;
+    auto               &op = envGet(ComplexField, par().op);
 
     envGetTmp(ComplexField, buf1);
     envGetTmp(ComplexField, buf2);
     envGetTmp(ComplexField, lap);
     lap = zero;
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
+    if (!par().output.empty())
+    {
+        result.type = par().type;
+        result.value.resize(nd, std::vector<Complex>(nd));
+    }
+    for (unsigned int mu = 0; mu < nd; ++mu)
     {
         dmu(buf1, op, mu, par().type);
         dmu(buf2, buf1, mu, par().type);
         lap += buf2;
     }
-    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
-    for (unsigned int nu = mu; nu < env().getNd(); ++nu)
+    for (unsigned int mu = 0; mu < nd; ++mu)
+    for (unsigned int nu = mu; nu < nd; ++nu)
     {
         auto &out = envGet(ComplexField, varName(getName(), mu, nu));
         dmu(buf1, op, mu, par().type);
@@ -163,16 +170,11 @@ void TTransProj<SImpl>::execute(void)
         }
         if (!par().output.empty())
         {
-            Result r;
-
-            r.op    = "(delta_" + std::to_string(mu) + "," + std::to_string(nu)
-                      + " d^2 - d_" + std::to_string(mu) + "*d_" 
-                      + std::to_string(nu) + ")*op";
-            r.value = TensorRemove(sum(out));
-            result.push_back(r);
+            result.value[mu][nu] = TensorRemove(sum(out));
+            result.value[mu][nu] = result.value[nu][mu];
         }
     }
-    if (result.size() > 0)
+    if (!par().output.empty())
     {
         saveResult(par().output, "transproj", result);
     }
diff --git a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
index 9e53553f..abca6212 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -50,6 +50,16 @@ public:
                                     std::string,              output);
 };
 
+class TwoPointResult: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TwoPointResult,
+                                    std::string, sink,
+                                    std::string, source,
+                                    std::vector<int>, mom,
+                                    std::vector<Complex>, data);
+};
+
 template <typename SImpl>
 class TTwoPoint: public Module<TwoPointPar>
 {
@@ -57,16 +67,6 @@ public:
     typedef typename SImpl::Field          Field;
     typedef typename SImpl::ComplexField   ComplexField;
     typedef          std::vector<TComplex> SlicedOp;
-
-    class Result: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::string, sink,
-                                        std::string, source,
-                                        std::vector<int>, mom,
-                                        std::vector<Complex>, data);
-    };
 public:
     // constructor
     TTwoPoint(const std::string name);
@@ -166,7 +166,7 @@ void TTwoPoint<SImpl>::execute(void)
     const unsigned int                           nmom = mom_.size();
     std::vector<int>                             dMask(nd, 1);
     std::set<std::string>                        ops;
-    std::vector<Result>                          result;
+    std::vector<TwoPointResult>                  result;
     std::map<std::string, std::vector<SlicedOp>> slicedOp;
     FFT                                          fft(env().getGrid());
 
@@ -201,7 +201,7 @@ void TTwoPoint<SImpl>::execute(void)
     for (unsigned int m = 0; m < nmom; ++m)
     for (auto &p: par().op)
     {
-        Result r;
+        TwoPointResult r;
 
         r.sink   = p.first;
         r.source = p.second;

From fa0d8feff421001740acf3a1d039ec3e86980164 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 17:56:27 +0100
Subject: [PATCH 15/28] Performance of CovariantCshift now non-embarrassing.

---
 benchmarks/Benchmark_su3.cc |  6 +--
 lib/cshift/Cshift_common.h  | 93 ++++++++++++++++++++++---------------
 lib/lattice/Lattice_base.h  |  3 +-
 3 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 7e5436b1..7b1b2c1a 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -36,7 +36,7 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 #define LMAX (32)
-#define LMIN (4)
+#define LMIN (16)
 #define LINC (4)
 
   int64_t Nloop=2000;
@@ -204,7 +204,7 @@ int main (int argc, char ** argv)
 	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
   }
-
+#if 1
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@@ -248,6 +248,6 @@ int main (int argc, char ** argv)
 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
       }
     }
-
+#endif
   Grid_finalize();
 }
diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h
index 1be672e8..b2f87315 100644
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -45,31 +45,33 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
   int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block[dimension];
+  int ent = 0;
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
 
   int stride=rhs._grid->_slice_stride[dimension];
   if ( cbmask == 0x3 ) { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
-	buffer[off+bo+b]=rhs._odata[so+o+b];
+	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
       }
     }
   } else { 
      int bo=0;
-     std::vector<std::pair<int,int> > table;
      for(int n=0;n<e1;n++){
        for(int b=0;b<e2;b++){
 	 int o  = n*stride;
 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	 if ( ocb &cbmask ) {
-	   table.push_back(std::pair<int,int> (bo++,o+b));
+	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 	 }
        }
      }
-     parallel_for(int i=0;i<table.size();i++){
-       buffer[off+table[i].first]=rhs._odata[so+table[i].second];
-     }
+  }
+  parallel_for(int i=0;i<ent;i++){
+    buffer[table[i].first]=rhs._odata[table[i].second];
   }
 }
 
@@ -140,31 +142,35 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block[dimension];
   int stride=rhs._grid->_slice_stride[dimension];
-  
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent    =0;
+
   if ( cbmask ==0x3 ) {
-    parallel_for_nest2(int n=0;n<e1;n++){
+
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
-	rhs._odata[so+o+b]=buffer[bo+b];
+	table[ent++] = std::pair<int,int>(so+o+b,bo);
       }
     }
+
   } else { 
-    std::vector<std::pair<int,int> > table;
     int bo=0;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  table.push_back(std::pair<int,int> (so+o+b,bo++));
+	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
       }
     }
-    parallel_for(int i=0;i<table.size();i++){
-       //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
-       rhs._odata[table[i].first]=buffer[table[i].second];
-     }
+  }
+
+  parallel_for(int i=0;i<ent;i++){
+    rhs._odata[table[i].first]=buffer[table[i].second];
   }
 }
 
@@ -228,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
   int e2=rhs._grid->_slice_block[dimension];
   int stride = rhs._grid->_slice_stride[dimension];
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent=0;
+
   if(cbmask == 0x3 ){
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
- 
         int o =n*stride+b;
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
   } else { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
- 
         int o =n*stride+b;
         int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
         if ( ocb&cbmask ) {
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	}
       }
     }
   }
-  
+
+  parallel_for(int i=0;i<ent;i++){
+    lhs._odata[table[i].first]=rhs._odata[table[i].second];
+  }
+
 }
 
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@@ -269,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs._grid->_slice_block [dimension];
   int stride = rhs._grid->_slice_stride[dimension];
 
-  parallel_for_nest2(int n=0;n<e1;n++){
-  for(int b=0;b<e2;b++){
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  int ent=0;
 
+  double t_tab,t_perm;
+  if ( cbmask == 0x3 ) {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      int o  =n*stride;
+      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  } else {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
       int o  =n*stride;
       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
-	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
-      }
+      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  }
 
-  }}
+  parallel_for(int i=0;i<ent;i++){
+    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
+  }
 }
 
 //////////////////////////////////////////////////////
@@ -291,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
 
+  double t_local;
+  
   if ( sshift[0] == sshift[1] ) {
     Cshift_local(ret,rhs,dimension,shift,0x3);
   } else {
@@ -299,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
   }
 }
 
-template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
   GridBase *grid = rhs._grid;
   int fd = grid->_fdimensions[dimension];
@@ -325,11 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 
     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
     int sx     = (x+sshift)%rd;
-
-    // FIXME : This must change where we have a 
-    // Rotate slice.
     
-    // Document how this works ; why didn't I do this when I first wrote it...
     // wrap is whether sshift > rd.
     //  num is sshift mod rd.
     // 
@@ -365,10 +384,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
 
     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
     else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
   
   }
-  return ret;
 }
 }
 #endif
diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index 98713c14..1169d18f 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -256,7 +256,7 @@ public:
       _odata[ss]=r._odata[ss];
     }  	
   }
-  
+
   Lattice(Lattice&& r){ // move constructor
     _grid = r._grid;
     checkerboard = r.checkerboard;
@@ -270,6 +270,7 @@ public:
     _odata       =std::move(r._odata);
     return *this;
   }
+
   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
     _grid        = r._grid;
     checkerboard = r.checkerboard;

From e9f1ac09ded186335c465b54bed60f3a44477ab9 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 23:00:08 +0100
Subject: [PATCH 16/28] static

---
 lib/threads/Threads.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/threads/Threads.h b/lib/threads/Threads.h
index 36daf2af..dacaf5d8 100644
--- a/lib/threads/Threads.h
+++ b/lib/threads/Threads.h
@@ -40,7 +40,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
-#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for schedule(static) collapse(2)")
 #define PARALLEL_REGION       _Pragma("omp parallel")
 #define PARALLEL_CRITICAL     _Pragma("omp critical")
 #else

From 7ecc47ac89ddb310583fe5b548fd0804dfb6e0ce Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 23:00:28 +0100
Subject: [PATCH 17/28] Quenched test compile

---
 lib/lattice/Lattice_comparison_utils.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/lattice/Lattice_comparison_utils.h b/lib/lattice/Lattice_comparison_utils.h
index 9580d4d2..579449f1 100644
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@@ -198,7 +198,7 @@ namespace Grid {
       typedef typename vsimd::scalar_type scalar;\
       return Comparison(functor<scalar,scalar>(),lhs,rhs);\
     }\
-  template<class vsimd,IfSimd<vsimd> = 0>\
+  template<class vsimd>\
     inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
     {									\
       return lhs._internal op rhs._internal;				\
@@ -212,7 +212,7 @@ namespace Grid {
     inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
     {									\
       return lhs op rhs._internal;					\
-    }									
+    }									\
 
 
 DECLARE_RELATIONAL(<,slt);

From 1be80896048e42c825f81a6bc5a26d79c537f37d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 26 Apr 2018 23:42:45 +0100
Subject: [PATCH 18/28] Clean compile

---
 lib/lattice/Lattice_comparison_utils.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/lib/lattice/Lattice_comparison_utils.h b/lib/lattice/Lattice_comparison_utils.h
index 579449f1..cbac20ec 100644
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@@ -179,7 +179,7 @@ namespace Grid {
       return ret;
     }
 
-#define DECLARE_RELATIONAL(op,functor) \
+#define DECLARE_RELATIONAL_EQ(op,functor) \
   template<class vsimd,IfSimd<vsimd> = 0>\
     inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
     {\
@@ -198,11 +198,6 @@ namespace Grid {
       typedef typename vsimd::scalar_type scalar;\
       return Comparison(functor<scalar,scalar>(),lhs,rhs);\
     }\
-  template<class vsimd>\
-    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
-    {									\
-      return lhs._internal op rhs._internal;				\
-    }									\
   template<class vsimd>\
     inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
     {									\
@@ -214,12 +209,19 @@ namespace Grid {
       return lhs op rhs._internal;					\
     }									\
 
+#define DECLARE_RELATIONAL(op,functor) \
+  DECLARE_RELATIONAL_EQ(op,functor)    \
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+    {									\
+      return lhs._internal op rhs._internal;				\
+    }									
 
 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
-DECLARE_RELATIONAL(==,seq);
+DECLARE_RELATIONAL_EQ(==,seq);
 DECLARE_RELATIONAL(!=,sne);
 
 #undef DECLARE_RELATIONAL

From 809b1cdd58d33ca43cab460a838589dfd0f2ce78 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 05:19:10 +0100
Subject: [PATCH 19/28] Bug fix for MPI running ; introduced last night

---
 lib/cshift/Cshift_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h
index b2f87315..24e0d2ab 100644
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -152,7 +152,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
       for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
-	table[ent++] = std::pair<int,int>(so+o+b,bo);
+	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
       }
     }
 

From 0734e9ddd4a24e4d7d1ee7224568105cfd39ac5b Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 27 Apr 2018 14:39:01 +0900
Subject: [PATCH 20/28] Debugging Scatter_plane_simple

---
 lib/cshift/Cshift_common.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h
index b2f87315..c9faf365 100644
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -150,9 +150,9 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
 
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
-	int o   =n*rhs._grid->_slice_stride[dimension];
-	int bo  =n*rhs._grid->_slice_block[dimension];
-	table[ent++] = std::pair<int,int>(so+o+b,bo);
+        int o   =n*rhs._grid->_slice_stride[dimension];
+        int bo  =n*rhs._grid->_slice_block[dimension];
+        table[ent++] = std::pair<int,int>(so+o+b,bo+b);
       }
     }
 

From 75e4483407fe4f9f9715f06ba6e95be4c9eef2b8 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 07:49:57 +0100
Subject: [PATCH 21/28] Stronger convergence test

---
 .../iterative/ImplicitlyRestartedLanczos.h           | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 787cf15a..8011e796 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -479,15 +479,13 @@ until convergence
 	Field B(grid); B.checkerboard = evec[0].checkerboard;
 
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
+	int allconv =1;
 	for(int jj = 1; jj<=Nstop; jj*=2){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
-	    if ( j > Nconv ) {
-	      Nconv=j+1;
-	      jj=Nstop; // Terminate the scan
-	    }
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
+	    allconv=0;
 	  }
 	}
 	// Do evec[0] for good measure
@@ -495,8 +493,10 @@ until convergence
 	  int j=0;
 	  RealD e = eval2_copy[0]; 
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
 	}
+	if ( allconv ) Nconv = Nstop;
+
 	// test if we converged, if so, terminate
 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
 	//	if( Nconv>=Nstop || beta_k < betastp){

From b27f0e5a539fc59214d026d04143e67d2d5a0264 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 07:50:15 +0100
Subject: [PATCH 22/28] Control over IO

---
 lib/algorithms/iterative/LocalCoherenceLanczos.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h
index b8348c0c..54e4c6c8 100644
--- a/lib/algorithms/iterative/LocalCoherenceLanczos.h
+++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h
@@ -48,6 +48,7 @@ struct LanczosParams : Serializable {
 struct LocalCoherenceLanczosParams : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
+				  bool, saveEvecs,
 				  bool, doFine,
 				  bool, doFineRead,
 				  bool, doCoarse,

From 9b0240d1016c071750aefd28dbfbd97b781a8229 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 07:50:51 +0100
Subject: [PATCH 23/28] Hot start test

---
 tests/Test_compressed_lanczos_hot_start.cc | 243 +++++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 tests/Test_compressed_lanczos_hot_start.cc

diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc
new file mode 100644
index 00000000..998f1b9f
--- /dev/null
+++ b/tests/Test_compressed_lanczos_hot_start.cc
@@ -0,0 +1,243 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_compressed_lanczos_reorg.cc
+
+    Copyright (C) 2017
+
+Author: Leans heavily on Christoph Lehner's code
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *  Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features 
+ *  in Grid that were intended to be used to support blocked Aggregates, from
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
+{ 
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
+			      LinearOperatorBase<FineField> &FineOp,
+			      int checkerboard) 
+    // Base constructor
+    : LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard) 
+  {};
+
+  void checkpointFine(std::string evecs_file,std::string evals_file)
+  {
+    assert(this->subspace.size()==nbasis);
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR(this->_FineGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      WR.writeScidacFieldRecord(this->subspace[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_fine);
+  }
+
+  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
+  {
+    this->evals_fine.resize(nbasis);
+    this->subspace.resize(nbasis,this->_FineGrid);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_fine);
+    
+    assert(this->evals_fine.size()==nbasis);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
+      
+    }
+    RD.close();
+  }
+
+  void checkpointCoarse(std::string evecs_file,std::string evals_file)
+  {
+    int n = this->evec_coarse.size();
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR(this->_CoarseGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<n;k++) {
+      WR.writeScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_coarse);
+  }
+
+  void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
+  {
+    std::cout << "resizing coarse vecs to " << nvec<< std::endl;
+    this->evals_coarse.resize(nvec);
+    this->evec_coarse.resize(nvec,this->_CoarseGrid);
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_coarse);
+
+    assert(this->evals_coarse.size()==nvec);
+    emptyUserRecord record;
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nvec;k++) {
+      RD.readScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    RD.close();
+  }
+};
+
+int main (int argc, char ** argv) {
+
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+
+  LocalCoherenceLanczosParams Params;
+  {
+    Params.omega.resize(10);
+    Params.blockSize.resize(5);
+    XmlWriter writer("Params_template.xml");
+    write(writer,"Params",Params);
+    std::cout << GridLogMessage << " Written Params_template.xml" <<std::endl;
+  }
+  
+  { 
+    XmlReader reader(std::string("./Params.xml"));
+    read(reader, "Params", Params);
+  }
+
+  int     Ls = (int)Params.omega.size();
+  RealD mass = Params.mass;
+  RealD M5   = Params.M5;
+  std::vector<int> blockSize = Params.blockSize;
+  std::vector<int> latt({16,16,16,16});
+  uint64_t     vol = Ls*latt[0]*latt[1]*latt[2]*latt[3];
+  double   mat_flop= 2.0*1320.0*vol;    
+  // Grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(latt,
+								     GridDefaultSimd(Nd,vComplex::Nsimd()),
+								     GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> fineLatt     = latt;
+  int dims=fineLatt.size();
+  assert(blockSize.size()==dims+1);
+  std::vector<int> coarseLatt(dims);
+  std::vector<int> coarseLatt5d ;
+
+  for (int d=0;d<coarseLatt.size();d++){
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int i=0;i<coarseLatt.size();i++){
+    std::cout << coarseLatt[i]<<"x";
+  } 
+  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  std::cout << cLs<<std::endl;
+  
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+
+  // Gauge field
+  std::vector<int> seeds4({1,2,3,4});
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  LatticeGaugeField Umu(UGrid);
+  SU3::HotConfiguration(RNG4,Umu);
+  //  FieldMetaData header;
+  //  NerscIO::readConfiguration(Umu,header,Params.config);
+
+  std::cout << GridLogMessage << "Lattice dimensions: " << latt << "   Ls: " << Ls << std::endl;
+
+  // ZMobius EO Operator
+  ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.);
+  SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
+
+  // Eigenvector storage
+  LanczosParams fine  =Params.FineParams;  
+  LanczosParams coarse=Params.CoarseParams;  
+
+  const int Ns1 = fine.Nstop;   const int Ns2 = coarse.Nstop;
+  const int Nk1 = fine.Nk;      const int Nk2 = coarse.Nk;
+  const int Nm1 = fine.Nm;      const int Nm2 = coarse.Nm;
+
+  std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
+  std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
+  assert(Nm2 >= Nm1);
+
+  const int nbasis= 60;
+  assert(nbasis==Ns1);
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
+  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
+
+  assert( (Params.doFine)||(Params.doFineRead));
+
+  if ( Params.doFine ) { 
+    std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
+    double t0=-usecond();
+    _LocalCoherenceLanczos.calcFine(fine.Cheby,
+		 fine.Nstop,fine.Nk,fine.Nm,
+		 fine.resid,fine.MaxIt, 
+		 fine.betastp,fine.MinRes);
+    t0+=usecond();
+
+    double t1=-usecond();
+    if ( Params.saveEvecs ) {
+      std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
+      _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
+    }
+    t1+=usecond();
+
+    std::cout << GridLogMessage << "Computation time is " << (t0)/1.0e6 <<" seconds"<<std::endl;
+    if ( Params.saveEvecs )  std::cout << GridLogMessage << "I/O         time is " << (t1)/1.0e6 <<" seconds"<<std::endl;
+    std::cout << GridLogMessage << "Time to solution is " << (t1+t0)/1.0e6 <<" seconds"<<std::endl;
+    std::cout << GridLogMessage << "Done"<<std::endl;
+  }
+
+  Grid_finalize();
+}
+

From 9f8d63e10426637c5b2a0b03f9fa441ec388cb35 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 07:51:12 +0100
Subject: [PATCH 24/28] Roll over version

---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index bfad377d..a0211af1 100644
--- a/VERSION
+++ b/VERSION
@@ -1,4 +1,4 @@
-Version : 0.7.0
+Version : 0.8.0
 
 - Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
 - MPI and MPI3 comms optimisations for KNL and OPA finished

From e369d7306d8bc53fb1e2456152d444b7e61fda07 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 07:51:44 +0100
Subject: [PATCH 25/28] Rename

---
 tests/lanczos/Test_compressed_lanczos.cc | 253 +++++++++++++++++++++++
 1 file changed, 253 insertions(+)
 create mode 100644 tests/lanczos/Test_compressed_lanczos.cc

diff --git a/tests/lanczos/Test_compressed_lanczos.cc b/tests/lanczos/Test_compressed_lanczos.cc
new file mode 100644
index 00000000..8bce82bb
--- /dev/null
+++ b/tests/lanczos/Test_compressed_lanczos.cc
@@ -0,0 +1,253 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_compressed_lanczos_reorg.cc
+
+    Copyright (C) 2017
+
+Author: Leans heavily on Christoph Lehner's code
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+/*
+ *  Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features 
+ *  in Grid that were intended to be used to support blocked Aggregates, from
+ */
+#include <Grid/Grid.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class Fobj,class CComplex,int nbasis>
+class LocalCoherenceLanczosScidac : public LocalCoherenceLanczos<Fobj,CComplex,nbasis>
+{ 
+public:
+  typedef iVector<CComplex,nbasis >           CoarseSiteVector;
+  typedef Lattice<CoarseSiteVector>           CoarseField;
+  typedef Lattice<CComplex>   CoarseScalar; // used for inner products on fine field
+  typedef Lattice<Fobj>          FineField;
+
+  LocalCoherenceLanczosScidac(GridBase *FineGrid,GridBase *CoarseGrid,
+			      LinearOperatorBase<FineField> &FineOp,
+			      int checkerboard) 
+    // Base constructor
+    : LocalCoherenceLanczos<Fobj,CComplex,nbasis>(FineGrid,CoarseGrid,FineOp,checkerboard) 
+  {};
+
+  void checkpointFine(std::string evecs_file,std::string evals_file)
+  {
+    assert(this->subspace.size()==nbasis);
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR(this->_FineGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      WR.writeScidacFieldRecord(this->subspace[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_fine);
+  }
+
+  void checkpointFineRestore(std::string evecs_file,std::string evals_file)
+  {
+    this->evals_fine.resize(nbasis);
+    this->subspace.resize(nbasis,this->_FineGrid);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_fine);
+    
+    assert(this->evals_fine.size()==nbasis);
+    
+    std::cout << GridLogIRL<< "checkpointFineRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nbasis;k++) {
+      this->subspace[k].checkerboard=this->_checkerboard;
+      RD.readScidacFieldRecord(this->subspace[k],record);
+      
+    }
+    RD.close();
+  }
+
+  void checkpointCoarse(std::string evecs_file,std::string evals_file)
+  {
+    int n = this->evec_coarse.size();
+    emptyUserRecord record;
+    Grid::QCD::ScidacWriter WR(this->_CoarseGrid->IsBoss());
+    WR.open(evecs_file);
+    for(int k=0;k<n;k++) {
+      WR.writeScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    WR.close();
+    
+    XmlWriter WRx(evals_file);
+    write(WRx,"evals",this->evals_coarse);
+  }
+
+  void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
+  {
+    std::cout << "resizing coarse vecs to " << nvec<< std::endl;
+    this->evals_coarse.resize(nvec);
+    this->evec_coarse.resize(nvec,this->_CoarseGrid);
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evals from "<<evals_file<<std::endl;
+    XmlReader RDx(evals_file);
+    read(RDx,"evals",this->evals_coarse);
+
+    assert(this->evals_coarse.size()==nvec);
+    emptyUserRecord record;
+    std::cout << GridLogIRL<< "checkpointCoarseRestore:  Reading evecs from "<<evecs_file<<std::endl;
+    Grid::QCD::ScidacReader RD ;
+    RD.open(evecs_file);
+    for(int k=0;k<nvec;k++) {
+      RD.readScidacFieldRecord(this->evec_coarse[k],record);
+    }
+    RD.close();
+  }
+};
+
+int main (int argc, char ** argv) {
+
+  Grid_init(&argc,&argv);
+  GridLogIRL.TimingMode(1);
+
+  LocalCoherenceLanczosParams Params;
+  {
+    Params.omega.resize(10);
+    Params.blockSize.resize(5);
+    XmlWriter writer("Params_template.xml");
+    write(writer,"Params",Params);
+    std::cout << GridLogMessage << " Written Params_template.xml" <<std::endl;
+  }
+  
+  { 
+    XmlReader reader(std::string("./Params.xml"));
+    read(reader, "Params", Params);
+  }
+
+  int     Ls = (int)Params.omega.size();
+  RealD mass = Params.mass;
+  RealD M5   = Params.M5;
+  std::vector<int> blockSize = Params.blockSize;
+
+  // Grids
+  GridCartesian         * UGrid     = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								     GridDefaultSimd(Nd,vComplex::Nsimd()),
+								     GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> fineLatt     = GridDefaultLatt();
+  int dims=fineLatt.size();
+  assert(blockSize.size()==dims+1);
+  std::vector<int> coarseLatt(dims);
+  std::vector<int> coarseLatt5d ;
+
+  for (int d=0;d<coarseLatt.size();d++){
+    coarseLatt[d] = fineLatt[d]/blockSize[d];    assert(coarseLatt[d]*blockSize[d]==fineLatt[d]);
+  }
+
+  std::cout << GridLogMessage<< " 5d coarse lattice is ";
+  for (int i=0;i<coarseLatt.size();i++){
+    std::cout << coarseLatt[i]<<"x";
+  } 
+  int cLs = Ls/blockSize[dims]; assert(cLs*blockSize[dims]==Ls);
+  std::cout << cLs<<std::endl;
+  
+  GridCartesian         * CoarseGrid4    = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * CoarseGrid4rb  = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
+  GridCartesian         * CoarseGrid5    = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
+
+  // Gauge field
+  LatticeGaugeField Umu(UGrid);
+  FieldMetaData header;
+  NerscIO::readConfiguration(Umu,header,Params.config);
+  std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << "   Ls: " << Ls << std::endl;
+
+  // ZMobius EO Operator
+  ZMobiusFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5, Params.omega,1.,0.);
+  SchurDiagTwoOperator<ZMobiusFermionR,LatticeFermion> HermOp(Ddwf);
+
+  // Eigenvector storage
+  LanczosParams fine  =Params.FineParams;  
+  LanczosParams coarse=Params.CoarseParams;  
+
+  const int Ns1 = fine.Nstop;   const int Ns2 = coarse.Nstop;
+  const int Nk1 = fine.Nk;      const int Nk2 = coarse.Nk;
+  const int Nm1 = fine.Nm;      const int Nm2 = coarse.Nm;
+
+  std::cout << GridLogMessage << "Keep " << fine.Nstop   << " fine   vectors" << std::endl;
+  std::cout << GridLogMessage << "Keep " << coarse.Nstop << " coarse vectors" << std::endl;
+  assert(Nm2 >= Nm1);
+
+  const int nbasis= 60;
+  assert(nbasis==Ns1);
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
+  std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
+
+  assert( (Params.doFine)||(Params.doFineRead));
+
+  if ( Params.doFine ) { 
+    std::cout << GridLogMessage << "Performing fine grid IRL Nstop "<< Ns1 << " Nk "<<Nk1<<" Nm "<<Nm1<< std::endl;
+    _LocalCoherenceLanczos.calcFine(fine.Cheby,
+		 fine.Nstop,fine.Nk,fine.Nm,
+		 fine.resid,fine.MaxIt, 
+		 fine.betastp,fine.MinRes);
+
+    std::cout << GridLogIRL<<"Checkpointing Fine evecs"<<std::endl;
+    _LocalCoherenceLanczos.checkpointFine(std::string("evecs.scidac"),std::string("evals.xml"));
+    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    std::cout << GridLogIRL<<"Orthogonalising"<<std::endl;
+    _LocalCoherenceLanczos.Orthogonalise();
+    std::cout << GridLogIRL<<"Orthogonaled"<<std::endl;
+  }
+
+  if ( Params.doFineRead ) { 
+    _LocalCoherenceLanczos.checkpointFineRestore(std::string("evecs.scidac"),std::string("evals.xml"));
+    _LocalCoherenceLanczos.testFine(fine.resid*100.0); // Coarse check
+    _LocalCoherenceLanczos.Orthogonalise();
+  }
+
+  if ( Params.doCoarse ) {
+    std::cout << GridLogMessage << "Performing coarse grid IRL Nstop "<< Ns2<< " Nk "<<Nk2<<" Nm "<<Nm2<< std::endl;
+    _LocalCoherenceLanczos.calcCoarse(coarse.Cheby,Params.Smoother,Params.coarse_relax_tol,
+			      coarse.Nstop, coarse.Nk,coarse.Nm,
+			      coarse.resid, coarse.MaxIt, 
+			      coarse.betastp,coarse.MinRes);
+
+
+    std::cout << GridLogIRL<<"Checkpointing coarse evecs"<<std::endl;
+    _LocalCoherenceLanczos.checkpointCoarse(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"));
+  }
+
+  if ( Params.doCoarseRead ) {
+    // Verify we can reread ???
+    _LocalCoherenceLanczos.checkpointCoarseRestore(std::string("evecs.coarse.scidac"),std::string("evals.coarse.xml"),coarse.Nstop);
+    _LocalCoherenceLanczos.testCoarse(coarse.resid*100.0,Params.Smoother,Params.coarse_relax_tol); // Coarse check
+  }
+  Grid_finalize();
+}
+

From fe6860b4dd332dfc618c9d139c54f8bc684a0b2d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 27 Apr 2018 08:57:34 +0100
Subject: [PATCH 26/28] Update with LIME library guard

---
 tests/Test_compressed_lanczos_hot_start.cc | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc
index 998f1b9f..3276d0f8 100644
--- a/tests/Test_compressed_lanczos_hot_start.cc
+++ b/tests/Test_compressed_lanczos_hot_start.cc
@@ -56,6 +56,7 @@ public:
 
   void checkpointFine(std::string evecs_file,std::string evals_file)
   {
+#ifdef HAVE_LIME
     assert(this->subspace.size()==nbasis);
     emptyUserRecord record;
     Grid::QCD::ScidacWriter WR(this->_FineGrid->IsBoss());
@@ -67,10 +68,14 @@ public:
     
     XmlWriter WRx(evals_file);
     write(WRx,"evals",this->evals_fine);
+#else
+    assert(0);
+#endif
   }
 
   void checkpointFineRestore(std::string evecs_file,std::string evals_file)
   {
+#ifdef HAVE_LIME
     this->evals_fine.resize(nbasis);
     this->subspace.resize(nbasis,this->_FineGrid);
     
@@ -90,10 +95,14 @@ public:
       
     }
     RD.close();
+#else
+    assert(0);
+#endif 
   }
 
   void checkpointCoarse(std::string evecs_file,std::string evals_file)
   {
+#ifdef HAVE_LIME
     int n = this->evec_coarse.size();
     emptyUserRecord record;
     Grid::QCD::ScidacWriter WR(this->_CoarseGrid->IsBoss());
@@ -105,10 +114,14 @@ public:
     
     XmlWriter WRx(evals_file);
     write(WRx,"evals",this->evals_coarse);
+#else
+    assert(0);
+#endif
   }
 
   void checkpointCoarseRestore(std::string evecs_file,std::string evals_file,int nvec)
   {
+#ifdef HAVE_LIME
     std::cout << "resizing coarse vecs to " << nvec<< std::endl;
     this->evals_coarse.resize(nvec);
     this->evec_coarse.resize(nvec,this->_CoarseGrid);
@@ -125,6 +138,9 @@ public:
       RD.readScidacFieldRecord(this->evec_coarse[k],record);
     }
     RD.close();
+#else 
+    assert(0);
+#endif
   }
 };
 

From 1dddd17e3c73c37bf769ad31532e44cbb0cdc585 Mon Sep 17 00:00:00 2001
From: Dr Peter Boyle <paboyle@tesseract-login1.ib0.sgi.cluster.dirac.ed.ac.uk>
Date: Fri, 27 Apr 2018 11:44:46 +0100
Subject: [PATCH 27/28] Benchmark improvements from tesseract

---
 benchmarks/Benchmark_ITT.cc   | 13 ++++++++++---
 benchmarks/Benchmark_comms.cc | 13 +++++++++----
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 666e4830..f811ac32 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -158,8 +158,10 @@ public:
 
 	  dbytes=0;
 	  ncomm=0;
-
-	  parallel_for(int dir=0;dir<8;dir++){
+#ifdef GRID_OMP
+#pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads)
+#endif
+	  for(int dir=0;dir<8;dir++){
 
 	    double tbytes;
 	    int mu =dir % 4;
@@ -175,9 +177,14 @@ public:
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
+#ifdef GRID_OMP
+	int tid = omp_get_thread_num(); 
+#else 
+        int tid = dir;
+#endif
 	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 						 (void *)&rbuf[dir][0], recv_from_rank,
-						 bytes,dir);
+						 bytes,tid);
 	  
 #ifdef GRID_OMP
 #pragma omp atomic
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 29ccf96c..304a09fc 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -169,7 +169,11 @@ int main (int argc, char ** argv)
   for(int lat=4;lat<=maxlat;lat+=4){
     for(int Ls=8;Ls<=8;Ls*=2){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+                                    lat*mpi_layout[1],
+                                    lat*mpi_layout[2],
+                                    lat*mpi_layout[3]});
+
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
       RealD Nrank = Grid._Nprocessors;
@@ -485,7 +489,8 @@ int main (int argc, char ** argv)
 	dbytes=0;
 	ncomm=0;
 
-	parallel_for(int dir=0;dir<8;dir++){
+#pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads)
+	for(int dir=0;dir<8;dir++){
 
 	  double tbytes;
 	  int mu =dir % 4;
@@ -502,9 +507,9 @@ int main (int argc, char ** argv)
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
-
+            int tid = omp_get_thread_num();
 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+					       (void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
 
 #pragma omp atomic
 	    dbytes+=tbytes;

From edc28dcfbf0df697e78bb8718e3ad4980fe885ad Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 1 May 2018 14:02:31 +0100
Subject: [PATCH 28/28] Hadrons: scalar SU(N) 2-pt fix

---
 .../Hadrons/Modules/MScalarSUN/TwoPoint.hpp   | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
index abca6212..1496edf9 100644
--- a/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
+++ b/extras/Hadrons/Modules/MScalarSUN/TwoPoint.hpp
@@ -64,9 +64,9 @@ template <typename SImpl>
 class TTwoPoint: public Module<TwoPointPar>
 {
 public:
-    typedef typename SImpl::Field          Field;
-    typedef typename SImpl::ComplexField   ComplexField;
-    typedef          std::vector<TComplex> SlicedOp;
+    typedef typename SImpl::Field         Field;
+    typedef typename SImpl::ComplexField  ComplexField;
+    typedef          std::vector<Complex> SlicedOp;
 public:
     // constructor
     TTwoPoint(const std::string name);
@@ -160,18 +160,24 @@ void TTwoPoint<SImpl>::execute(void)
         LOG(Message) << "  <" << p.first << " " << p.second << ">" << std::endl;
     }
 
-    const unsigned int                           nd   = env().getDim().size();
-    const unsigned int                           nt   = env().getDim().back();
-    const unsigned int                           nop  = par().op.size();
-    const unsigned int                           nmom = mom_.size();
+    const unsigned int                           nd      = env().getNd();
+    const unsigned int                           nt      = env().getDim().back();
+    const unsigned int                           nop     = par().op.size();
+    const unsigned int                           nmom    = mom_.size();
+    double                                       partVol = 1.;
     std::vector<int>                             dMask(nd, 1);
     std::set<std::string>                        ops;
     std::vector<TwoPointResult>                  result;
     std::map<std::string, std::vector<SlicedOp>> slicedOp;
     FFT                                          fft(env().getGrid());
+    TComplex                                     buf;
 
     envGetTmp(ComplexField, ftBuf);
     dMask[nd - 1] = 0;
+    for (unsigned int mu = 0; mu < nd - 1; ++mu)
+    {
+        partVol *= env().getDim()[mu];
+    }
     for (auto &p: par().op)
     {
         ops.insert(p.first);
@@ -183,7 +189,7 @@ void TTwoPoint<SImpl>::execute(void)
 
         slicedOp[o].resize(nmom);
         LOG(Message) << "Operator '" << o << "' FFT" << std::endl;
-        fft.FFT_dim_mask(ftBuf, op, dMask, FFT::backward);
+        fft.FFT_dim_mask(ftBuf, op, dMask, FFT::forward);
         for (unsigned int m = 0; m < nmom; ++m)
         {
             auto qt = mom_[m];
@@ -193,7 +199,8 @@ void TTwoPoint<SImpl>::execute(void)
             for (unsigned int t = 0; t < nt; ++t)
             {
                 qt[nd - 1] = t;
-                peekSite(slicedOp[o][m][t], ftBuf, qt);
+                peekSite(buf, ftBuf, qt);
+                slicedOp[o][m][t] = TensorRemove(buf)/partVol;
             }
         }
     }
@@ -228,7 +235,7 @@ std::vector<Complex> TTwoPoint<SImpl>::makeTwoPoint(
     {
         for (unsigned int t  = 0; t < nt; ++t)
         {
-            res[dt] += TensorRemove(trace(sink[(t+dt)%nt]*adj(source[t])));
+            res[dt] += sink[(t+dt)%nt]*adj(source[t]);
         }
         res[dt] *= 1./static_cast<double>(nt);
     }