From e1042aef77f70f2c4ab44d9066fd4c6d093f6e50 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 28 Oct 2016 17:20:04 +0100
Subject: [PATCH 01/17] First version of the doube prec for testing purposes

It does not compile single and double version at the same time
---
 lib/qcd/action/fermion/WilsonKernelsAsm.cc    | 81 ++++++++++++++++++-
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h |  4 +-
 lib/simd/Grid_avx512.h                        |  6 --
 3 files changed, 80 insertions(+), 11 deletions(-)
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 74862400..2fc9b035 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -53,12 +53,13 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
 }
 
 #if defined(AVX512) 
-    
+#include <simd/Intel512wilson.h>
+
+#if defined(GRID_DEFAULT_PRECISION_SINGLE)    
     ///////////////////////////////////////////////////////////
     // If we are AVX512 specialise the single precision routine
     ///////////////////////////////////////////////////////////
-    
-#include <simd/Intel512wilson.h>
+
 #include <simd/Intel512single.h>
     
 static Vector<vComplexF> signs;
@@ -78,6 +79,7 @@ static Vector<vComplexF> signs;
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexF
   
 #undef KERNEL_DAG
 template<> void 
@@ -113,8 +115,79 @@ template<> void
 WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#undef COMPLEX_TYPE
+	
+#endif //Single precision			    
+
+#if defined(GRID_DEFAULT_PRECISION_DOUBLE)    
+//temporary separating the two sections
+//for debug in isolation
+//can be unified
+
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the double precision routine
+    ///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signs;
+    
+  int setupSigns(void ){
+    Vector<vComplexD> bother(2);
+    signs = bother;
+    vrsign(signs[0]);
+    visign(signs[1]);
+    return 1;
+  }
+  static int signInit = setupSigns();
+  
+#define label(A)  ilabel(A)
+#define ilabel(A) ".globl\n"  #A ":\n" 
+  
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexD
+  
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
-#endif
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+#define FX(A) DWFASM_ ## A
+#define MAYBEPERM(A,B) 
+#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+	
+#undef COMPLEX_TYPE
+#endif //Double precision			    
+
+#endif //AVX512
 
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index 12579d8c..72e13754 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -5,7 +5,9 @@
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
+  //COMPLEX_TYPE is vComplexF of vComplexD depending 
+  //on the chosen precision
+  COMPLEX_TYPE *isigns = &signs[0];
 
   MASK_REGS;
   int nmax=U._grid->oSites();
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 62789462..136c940e 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -371,14 +371,8 @@ namespace Optimization {
   // Some Template specialization
 
   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-<<<<<<< HEAD
-#define GNU_CLANG_COMPILER 
-#ifdef GNU_CLANG_COMPILER
-=======
-
 #ifndef __INTEL_COMPILER
 #warning "Slow reduction due to incomplete reduce intrinsics"
->>>>>>> develop
   //Complex float Reduce
   template<>
     inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){

From 9b066e94d00c7190d9bf0531caa8964a4e79cf1f Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sun, 30 Oct 2016 12:04:06 +0000
Subject: [PATCH 02/17] Compilation with both single and double precision

---
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 55 ++++++++++------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 2fc9b035..8bd55d61 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -10,6 +10,7 @@
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -55,23 +56,24 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
 #if defined(AVX512) 
 #include <simd/Intel512wilson.h>
 
-#if defined(GRID_DEFAULT_PRECISION_SINGLE)    
     ///////////////////////////////////////////////////////////
     // If we are AVX512 specialise the single precision routine
     ///////////////////////////////////////////////////////////
 
 #include <simd/Intel512single.h>
     
-static Vector<vComplexF> signs;
-    
-  int setupSigns(void ){
-    Vector<vComplexF> bother(2);
+static Vector<vComplexF> signsF;
+
+  template<typename vtype>    
+  int setupSigns(Vector<vtype>& signs ){
+    Vector<vtype> bother(2);
     signs = bother;
     vrsign(signs[0]);
     visign(signs[1]);
     return 1;
   }
-  static int signInit = setupSigns();
+
+  static int signInitF = setupSigns(signsF);
   
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
@@ -80,6 +82,7 @@ static Vector<vComplexF> signs;
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
 #define COMPLEX_TYPE vComplexF
+#define signs signsF
   
 #undef KERNEL_DAG
 template<> void 
@@ -116,34 +119,22 @@ WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebe
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
 	
-#endif //Single precision			    
-
-#if defined(GRID_DEFAULT_PRECISION_DOUBLE)    
-//temporary separating the two sections
-//for debug in isolation
-//can be unified
-
-    ///////////////////////////////////////////////////////////
-    // If we are AVX512 specialise the double precision routine
-    ///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
 
 #include <simd/Intel512double.h>
     
-static Vector<vComplexD> signs;
+static Vector<vComplexD> signsD;
+#define signs signsD
+static int signInitD = setupSigns(signsD);
     
-  int setupSigns(void ){
-    Vector<vComplexD> bother(2);
-    signs = bother;
-    vrsign(signs[0]);
-    visign(signs[1]);
-    return 1;
-  }
-  static int signInit = setupSigns();
-  
-#define label(A)  ilabel(A)
-#define ilabel(A) ".globl\n"  #A ":\n" 
-  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
@@ -185,7 +176,11 @@ WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebe
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 	
 #undef COMPLEX_TYPE
-#endif //Double precision			    
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
 
 #endif //AVX512
 

From e8c3174ae28746c8f24377ac7a4fbddc7d9a3831 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sun, 30 Oct 2016 12:23:11 +0000
Subject: [PATCH 03/17] Small change in the defines

---
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 8bd55d61..83124d1a 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -159,8 +159,8 @@ WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
+#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG

From ae8561892ef8e6763454a552d9aba45fa33074c6 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Wed, 2 Nov 2016 10:21:06 +0000
Subject: [PATCH 04/17] Eliminating useless defines

---
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 83124d1a..d7a9edd3 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -103,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG
@@ -159,8 +159,8 @@ WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG

From afc8d3e524b6ceff65c415ad05b41384fc420e2a Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Mon, 7 Nov 2016 11:13:43 +0000
Subject: [PATCH 05/17] Adding support for parallel recursive compilation for
 the tests

---
 Makefile.am       | 2 +-
 tests/Makefile.am | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 049220e8..2606b88c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -4,7 +4,7 @@ SUBDIRS = lib benchmarks tests
 .PHONY: tests
 
 tests:
-	make -C tests tests
+	$(MAKE) -C tests tests
 
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c98bc2d0..2e7c1f0a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -9,4 +9,4 @@ endif
 include Make.inc
 
 subtests:
-	for d in $(SUBDIRS); do make -C $${d} tests; done
+	for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done

From 0cff8754d1b8e2c46b972dcc02b0e80069c2bcd7 Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Tue, 8 Nov 2016 11:35:41 +0000
Subject: [PATCH 06/17] Usecs

---
 lib/FFT.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index fda43eb8..52d08cbe 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -122,7 +122,8 @@ namespace Grid {
     
     double Flops(void) {return flops;}
     double MFlops(void) {return flops/usec;}
-    
+    double USec(void)   {return (double)usec;}    
+
     FFT ( GridCartesian * grid ) :
     vgrid(grid),
     Nd(grid->_ndimension),

From 3dc2e05d6edb5acd0582293017d4a7fe5753297d Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Tue, 8 Nov 2016 11:36:18 +0000
Subject: [PATCH 07/17] Time as well since MKL returns zero for Mflops

---
 tests/core/Test_fftf.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc
index 54b1ddb3..4eb4398d 100644
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -93,10 +93,10 @@ int main (int argc, char ** argv)
   C=C-Ctilde;
   std::cout << "diff scalar "<<norm2(C) << std::endl;
 
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 
   SpinMatrixF Sp; 
   Sp = zero; Sp = Sp+cVol;

From f85b35314dafe12ba6a497c46d0b2c0bbbc373a2 Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Tue, 8 Nov 2016 11:49:13 +0000
Subject: [PATCH 08/17] Fix a routine for single node processor coor from rank

---
 lib/communicator/Communicator_none.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc
index 0f43f1f5..5e91b305 100644
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   source =0;

From 343f3e829f0d8cd4cb5913a150db3039bec7437d Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Tue, 8 Nov 2016 13:42:12 +0000
Subject: [PATCH 09/17] Fixes prerelease to make all tests

---
 Makefile.am                 |  2 +-
 tests/Makefile.am           |  2 +-
 tests/core/Test_fft_gfix.cc | 14 +++++++-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 049220e8..2606b88c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -4,7 +4,7 @@ SUBDIRS = lib benchmarks tests
 .PHONY: tests
 
 tests:
-	make -C tests tests
+	$(MAKE) -C tests tests
 
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c98bc2d0..2e7c1f0a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -9,4 +9,4 @@ endif
 include Make.inc
 
 subtests:
-	for d in $(SUBDIRS); do make -C $${d} tests; done
+	for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done
diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 6a2868b0..d5779726 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -120,7 +120,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     LatticeComplex  Fp(grid);
     LatticeComplex  psq(grid); psq=zero;
     LatticeComplex  pmu(grid); 
-    LatticeComplex   one(grid); one = ComplexD(1.0,0.0);
+    LatticeComplex   one(grid); one = Complex(1.0,0.0);
 
     GaugeMat g(grid);
     GaugeMat dmuAmu_p(grid);
@@ -261,25 +261,25 @@ int main (int argc, char ** argv)
   std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge    *" <<std::endl;
   std::cout<< "*****************************************************************" <<std::endl;
 
-  LatticeGaugeFieldD   Umu(&GRID);
-  LatticeGaugeFieldD   Uorg(&GRID);
-  LatticeColourMatrixD   g(&GRID); // Gauge xform
+  LatticeGaugeField   Umu(&GRID);
+  LatticeGaugeField   Uorg(&GRID);
+  LatticeColourMatrix   g(&GRID); // Gauge xform
 
   
   SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
   Uorg=Umu;
 
   SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge
-  RealD plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  RealD plaq=WilsonLoops<PeriodicGimplF>::avgPlaquette(Umu);
   std::cout << " Initial plaquette "<<plaq << std::endl;
 
 
 
   RealD alpha=0.1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplD>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
+  FourierAcceleratedGaugeFixer<PeriodicGimplF>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
 
 
-  plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  plaq=WilsonLoops<PeriodicGimplF>::avgPlaquette(Umu);
   std::cout << " Final plaquette "<<plaq << std::endl;
 
   Uorg = Uorg - Umu;

From c5a025d421cb30f9ff114bf23a53b60195427ae6 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 14:07:59 +0000
Subject: [PATCH 10/17] README typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 49f08237..f4a376f1 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ If you want to build all the tests at once just use `make tests`.
 
 ### Possible communication interfaces
 
-The following options can be use with the `--enable-simd=` option to target different communication interfaces:
+The following options can be use with the `--enable-comms=` option to target different communication interfaces:
 
 | `<comm>`       | Description                                                   |
 | -------------- | ------------------------------------------------------------- |

From f6e1a5b348a107e58e8bb9a94a2323047630a473 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 14:08:33 +0000
Subject: [PATCH 11/17] building tests depends on building the library at the
 top level

---
 Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.am b/Makefile.am
index 2606b88c..18b3ddc3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,7 +3,7 @@ SUBDIRS = lib benchmarks tests
 
 .PHONY: tests
 
-tests:
+tests: all
 	$(MAKE) -C tests tests
 
 AM_CXXFLAGS += -I$(top_builddir)/include

From a26adfb0908ebe4c72c977b78e0b803742c06a2e Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 14:11:18 +0000
Subject: [PATCH 12/17] README: only markdown

---
 README | 44 --------------------------------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 README

diff --git a/README b/README
deleted file mode 100644
index 17e92fa0..00000000
--- a/README
+++ /dev/null
@@ -1,44 +0,0 @@
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
-
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shapped arrays are called conformable arrays.
-
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
-
-The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes.
-
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
-
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
-
-These are presented as 
-
-  vRealF, vRealD, vComplexF, vComplexD 
-
-internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named
-
-  RealF, RealD, ComplexF, ComplexD
-
-MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library.
-
-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-is are examples:
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
-
-     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
-     
-     

From 3d2a22a14d9a74bf3dd7788a41d69adb3759b722 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 15:31:47 +0000
Subject: [PATCH 13/17] include fix for MKL

---
 lib/FFT.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index fda43eb8..c1cd1980 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -29,9 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 
-#ifdef HAVE_FFTW	
+#ifdef HAVE_FFTW
+#ifdef USE_MKL
+#include <fftw/fftw3.h>
+#else
 #include <fftw3.h>
 #endif
+#endif
 
 
 namespace Grid {

From cd0be8cb24f29afe2a09010d7d87524af13aa543 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 15:32:05 +0000
Subject: [PATCH 14/17] Test_fft_gfix.c precision fix

---
 tests/core/Test_fft_gfix.cc | 68 ++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 6a2868b0..c6b77a13 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -42,7 +42,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
   static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
     for(int mu=0;mu<Nd;mu++){
 //      ImplComplex cmi(0.0,-1.0);
-      ComplexD cmi(0.0,-1.0);
+      Complex cmi(0.0,-1.0);
       A[mu] = Ta(U[mu]) * cmi;
     }
   }
@@ -52,13 +52,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
       dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
     }
   }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,RealD & alpha,int maxiter,RealD Omega_tol, RealD Phi_tol) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol) {
     GridBase *grid = Umu._grid;
 
-    RealD org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-    RealD org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-    RealD old_trace = org_link_trace;
-    RealD trG;
+    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+    Real old_trace = org_link_trace;
+    Real trG;
 
     std::vector<GaugeMat> U(Nd,grid);
                  GaugeMat dmuAmu(grid);
@@ -71,13 +71,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
       // Monitor progress and convergence test 
       // infrequently to minimise cost overhead
       if ( i %20 == 0 ) { 
-	RealD plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-	RealD link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
 
 	std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
 	
-	RealD Phi  = 1.0 - old_trace / link_trace ;
-	RealD Omega= 1.0 - trG;
+	Real Phi  = 1.0 - old_trace / link_trace ;
+	Real Omega= 1.0 - trG;
 
 
 	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
@@ -91,7 +91,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
       }
     }
   };
-  static RealD SteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
     GridBase *grid = U[0]._grid;
 
     std::vector<GaugeMat> A(Nd,grid);
@@ -101,26 +101,26 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
 
 
-    RealD vol = grid->gSites();
-    RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+    Real vol = grid->gSites();
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
 
     SU<Nc>::GaugeTransform(U,g);
 
     return trG;
   }
 
-  static RealD FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
 
     GridBase *grid = U[0]._grid;
 
-    RealD vol = grid->gSites();
+    Real vol = grid->gSites();
 
     FFT theFFT((GridCartesian *)grid);
 
     LatticeComplex  Fp(grid);
     LatticeComplex  psq(grid); psq=zero;
     LatticeComplex  pmu(grid); 
-    LatticeComplex   one(grid); one = ComplexD(1.0,0.0);
+    LatticeComplex   one(grid); one = Complex(1.0,0.0);
 
     GaugeMat g(grid);
     GaugeMat dmuAmu_p(grid);
@@ -139,13 +139,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     std::vector<int> coor(grid->_ndimension,0);
     for(int mu=0;mu<Nd;mu++) {
 
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
       LatticeCoordinate(pmu,mu);
       pmu = TwoPiL * pmu ;
       psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
     }
 
-    ComplexD psqMax(16.0);
+    Complex psqMax(16.0);
     Fp =  psqMax*one/psq;
 
     static int once;
@@ -160,20 +160,20 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
 
     GaugeMat ciadmam(grid);
-    ComplexD cialpha(0.0,-alpha);
+    Complex cialpha(0.0,-alpha);
     ciadmam = dmuAmu*cialpha;
     SU<Nc>::taExp(ciadmam,g);
 
-    RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
 
     SU<Nc>::GaugeTransform(U,g);
 
     return trG;
   }
 
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
     GridBase *grid = g._grid;
-    ComplexD cialpha(0.0,-alpha);
+    Complex cialpha(0.0,-alpha);
     GaugeMat ciadmam(grid);
     DmuAmu(A,dmuAmu);
     ciadmam = dmuAmu*cialpha;
@@ -193,11 +193,11 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     ComplexField  pha(grid);
     GaugeMat      Apha(grid);
 
-    ComplexD ci(0.0,1.0);
+    Complex ci(0.0,1.0);
 
     for(int mu=0;mu<Nd;mu++){
 
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
       LatticeCoordinate(pmu,mu);
       pmu = TwoPiL * pmu ;
       pha = exp(pmu *  (0.5 *ci)); // e(ipmu/2) since Amu(x+mu/2)
@@ -213,14 +213,14 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 
     ComplexField  pmu(grid);
     ComplexField  pha(grid);
-    ComplexD ci(0.0,1.0);
+    Complex ci(0.0,1.0);
     
     // Sign convention for FFTW calls:
     // A(x)= Sum_p e^ipx A(p) / V
     // A(p)= Sum_p e^-ipx A(x)
 
     for(int mu=0;mu<Nd;mu++){
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
       LatticeCoordinate(pmu,mu);
       pmu = TwoPiL * pmu ;
       pha = exp(-pmu *  (0.5 *ci)); // e(+ipmu/2) since Amu(x+mu/2)
@@ -241,7 +241,7 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
 
   std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
   int vol = 1;
@@ -261,25 +261,25 @@ int main (int argc, char ** argv)
   std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge    *" <<std::endl;
   std::cout<< "*****************************************************************" <<std::endl;
 
-  LatticeGaugeFieldD   Umu(&GRID);
-  LatticeGaugeFieldD   Uorg(&GRID);
-  LatticeColourMatrixD   g(&GRID); // Gauge xform
+  LatticeGaugeField   Umu(&GRID);
+  LatticeGaugeField   Uorg(&GRID);
+  LatticeColourMatrix   g(&GRID); // Gauge xform
 
   
   SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
   Uorg=Umu;
 
   SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge
-  RealD plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
   std::cout << " Initial plaquette "<<plaq << std::endl;
 
 
 
-  RealD alpha=0.1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplD>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
+  Real alpha=0.1;
+  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
 
 
-  plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
   std::cout << " Final plaquette "<<plaq << std::endl;
 
   Uorg = Uorg - Umu;

From 34cf702b24b34379ac5aa8d355fdad83fbf7b78f Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 17:00:38 +0000
Subject: [PATCH 15/17] README is now a symlink to README.md

---
 README | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 README

diff --git a/README b/README
new file mode 120000
index 00000000..42061c01
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+README.md
\ No newline at end of file

From 9576f0903dd0c9f3c44fbedf6fe5d14bfc22d798 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 8 Nov 2016 19:07:47 +0000
Subject: [PATCH 16/17] namespace fix

---
 lib/PerfCount.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/PerfCount.h b/lib/PerfCount.h
index 9ac58883..5ab07c02 100644
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif
 
 namespace Grid {
 
@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
    return tmp;
 }
 #elif defined __x86_64__
-#include <x86intrin.h>
 inline uint64_t cyclecount(void){ 
   return __rdtsc();
   //  unsigned int dummy;

From 33dc1f51b51a08a0b79b65276173d8c0b03fc582 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 9 Nov 2016 04:11:03 -0800
Subject: [PATCH 17/17] Final  sign off commits from Cori-1

---
 benchmarks/Benchmark_comms.cc                | 86 ++++++++++++++++++++
 benchmarks/Benchmark_dwf.cc                  | 37 +++++----
 benchmarks/Benchmark_dwf_sweep.cc            |  4 +-
 lib/Init.cc                                  |  2 +-
 lib/Log.cc                                   |  2 +-
 lib/algorithms/iterative/ConjugateGradient.h |  2 +-
 lib/qcd/hmc/HmcRunner.h                      |  2 +-
 tests/hmc/Test_hmc_WilsonFermionGauge.cc     |  2 +-
 8 files changed, 115 insertions(+), 22 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index de73bc81..969a2a42 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -193,6 +193,7 @@ int main (int argc, char ** argv)
     }
   }  
 
+
   Nloop=100;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
@@ -271,5 +272,90 @@ int main (int argc, char ** argv)
       std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }    
+
+
+
+  Nloop=100;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+      				    lat*mpi_layout[1],
+      				    lat*mpi_layout[2],
+      				    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){
+
+	std::vector<CartesianCommunicator::CommsRequest_t> requests;
+
+	ncomm=0;
+	for(int mu=0;mu<4;mu++){
+	
+	  if (mpi_layout[mu]>1 ) {
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    Grid.StencilSendToRecvFromBegin(requests,
+					    (void *)&xbuf[mu][0],
+					    xmit_to_rank,
+					    (void *)&rbuf[mu][0],
+					    recv_from_rank,
+					    bytes);
+	    //	    Grid.StencilSendToRecvFromComplete(requests);
+	    //	    requests.resize(0);
+
+	    comm_proc = mpi_layout[mu]-1;
+	  
+	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    Grid.StencilSendToRecvFromBegin(requests,
+					    (void *)&xbuf[mu+4][0],
+					    xmit_to_rank,
+					    (void *)&rbuf[mu+4][0],
+					    recv_from_rank,
+					    bytes);
+	    Grid.StencilSendToRecvFromComplete(requests);
+	    requests.resize(0);
+	  
+	  }
+	}
+	Grid.Barrier();
+
+      }
+      double stop=usecond();
+
+      double dbytes    = bytes;
+      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      double time = stop-start; // microseconds
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+    }
+  }    
+
   Grid_finalize();
 }
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 5d42a1f9..10e4521b 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -57,7 +57,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)
 
   int ncall =100;
   if (1) {
-
+    FGrid->Barrier();
     Dw.ZeroCounters();
     double t0=usecond();
     for(int i=0;i<ncall;i++){
@@ -147,6 +147,7 @@ int main (int argc, char ** argv)
       __SSC_STOP;
     }
     double t1=usecond();
+    FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=1344*volume*ncall;
@@ -158,7 +159,7 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result; 
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    assert (norm2(err)< 1.0e-5 );
+    assert (norm2(err)< 1.0e-4 );
     Dw.Report();
   }
 
@@ -193,6 +194,7 @@ int main (int argc, char ** argv)
       pokeSite(tmp,ssrc,site);
     }}}}}
     std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
+    FGrid->Barrier();
     double t0=usecond();
     sDw.ZeroCounters();
     for(int i=0;i<ncall;i++){
@@ -201,6 +203,7 @@ int main (int argc, char ** argv)
       __SSC_STOP;
     }
     double t1=usecond();
+    FGrid->Barrier();
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=1344*volume*ncall;
 
@@ -211,12 +214,12 @@ int main (int argc, char ** argv)
   
     if(0){
       for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-  sDw.Dhop(ssrc,sresult,0);
-  PerformanceCounter Counter(i);
-  Counter.Start();
-  sDw.Dhop(ssrc,sresult,0);
-  Counter.Stop();
-  Counter.Report();
+	sDw.Dhop(ssrc,sresult,0);
+	PerformanceCounter Counter(i);
+	Counter.Start();
+	sDw.Dhop(ssrc,sresult,0);
+	Counter.Stop();
+	Counter.Report();
       }
     }
 
@@ -240,7 +243,7 @@ int main (int argc, char ** argv)
       }
     }}}}}
     std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
-    assert (sum< 1.0e-5 );
+    assert (sum< 1.0e-4 );
 
 
     if (1) {
@@ -271,6 +274,7 @@ int main (int argc, char ** argv)
       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
       std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 
+      FGrid->Barrier();
       sDw.ZeroCounters();
       sDw.stat.init("DhopEO");
       double t0=usecond();
@@ -278,6 +282,7 @@ int main (int argc, char ** argv)
         sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
       }
       double t1=usecond();
+      FGrid->Barrier();
       sDw.stat.print();
 
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
@@ -301,7 +306,7 @@ int main (int argc, char ** argv)
 
       error+= norm2(ssrc_o);
       std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
-      if(error>1.0e-5) { 
+      if(error>1.0e-4) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
 	std::cout<< ssrc << std::endl;
@@ -337,7 +342,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   err = ref-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+  assert(norm2(err)<1.0e-4);
   LatticeFermion src_e (FrbGrid);
   LatticeFermion src_o (FrbGrid);
   LatticeFermion r_e   (FrbGrid);
@@ -363,11 +368,13 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
   {
     Dw.ZeroCounters();
+    FGrid->Barrier();
     double t0=usecond();
     for(int i=0;i<ncall;i++){
       Dw.DhopEO(src_o,r_e,DaggerNo);
     }
     double t1=usecond();
+    FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=(1344.0*volume*ncall)/2;
@@ -389,14 +396,14 @@ int main (int argc, char ** argv)
 
   err = r_eo-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+  assert(norm2(err)<1.0e-4);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-  assert(norm2(src_e)<1.0e-5);
-  assert(norm2(src_o)<1.0e-5);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
 
   Grid_finalize();
 }
diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index a3803b87..21db823d 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -69,8 +69,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 
-  int Lmax=32;
-  int dmin=0;
+  int Lmax=16;
+  int dmin=2;
   if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
   if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
   for (int L=8;L<=Lmax;L*=2){
diff --git a/lib/Init.cc b/lib/Init.cc
index b5ef0a95..d6d6b9f8 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -369,7 +369,7 @@ void Grid_init(int *argc,char ***argv)
   
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) 
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
diff --git a/lib/Log.cc b/lib/Log.cc
index 733bfc58..7521657b 100644
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -93,7 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index f340eb38..cf3872c8 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -154,7 +154,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
                   << LinalgTimer.Elapsed();
         std::cout << std::endl;
 
-        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);
+        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 
         return;
       }
diff --git a/lib/qcd/hmc/HmcRunner.h b/lib/qcd/hmc/HmcRunner.h
index a31ba784..53b127cf 100644
--- a/lib/qcd/hmc/HmcRunner.h
+++ b/lib/qcd/hmc/HmcRunner.h
@@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate {
     NoSmearing<Gimpl> SmearingPolicy;
     typedef MinimumNorm2<GaugeField, NoSmearing<Gimpl>, RepresentationsPolicy >
         IntegratorType;  // change here to change the algorithm
-    IntegratorParameters MDpar(20, 1.0);
+    IntegratorParameters MDpar(40, 1.0);
     IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
 
     // Checkpoint strategy
diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge.cc b/tests/hmc/Test_hmc_WilsonFermionGauge.cc
index 9dcf6343..351d1e68 100644
--- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc
@@ -68,7 +68,7 @@ class HmcRunner : public NerscHmcRunner {
     TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
 
     // Set smearing (true/false), default: false
-    Nf2.is_smeared = true;
+    Nf2.is_smeared = false;
 
     // Collect actions
     ActionLevel<LatticeGaugeField> Level1(1);