From a2cffb030442cca679038f4655348df4260ab5de Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Mon, 21 Nov 2016 17:47:18 +0100
Subject: [PATCH 01/14] AVXFMA target fixed

---
 configure.ac                 |  4 ++--
 lib/simd/Grid_avx.h          | 18 +++++++++---------
 lib/simd/Grid_vector_types.h |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/configure.ac b/configure.ac
index a6658a96..90764cb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in
         AC_DEFINE([AVX1],[1],[AVX intrinsics])
         SIMD_FLAGS='-mavx -xavx';;
       AVXFMA)
-        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
-        SIMD_FLAGS='-mavx -mfma';;
+        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
+        SIMD_FLAGS='-mavx -fma';;
       AVX2)
         AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
         SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index f50eae2b..36360102 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -167,7 +167,7 @@ namespace Optimization {
     }
     //Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
           __m128i a0,a1;
           __m128i b0,b1;
           a0 = _mm256_extractf128_si256(a,0);
@@ -195,7 +195,7 @@ namespace Optimization {
     }
     //Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
           __m128i a0,a1;
           __m128i b0,b1;
           a0 = _mm256_extractf128_si256(a,0);
@@ -216,7 +216,7 @@ namespace Optimization {
   struct MultComplex{
     // Complex float
     inline __m256 operator()(__m256 a, __m256 b){
-#if defined (AVX1) 
+#if defined (AVX1)
       __m256 ymm0,ymm1,ymm2;
       ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
@@ -233,7 +233,7 @@ namespace Optimization {
       a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
       return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2)  || defined (AVXFMA)
       __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
       __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
       a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -264,7 +264,7 @@ namespace Optimization {
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
       */
-#if defined (AVX1) 
+#if defined (AVX1)
       __m256d ymm0,ymm1,ymm2;
       ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
       ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
@@ -279,7 +279,7 @@ namespace Optimization {
       a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
       return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
       __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
       a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -320,7 +320,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
       a= _mm256_macc_ps(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       a= _mm256_fmadd_ps( b, c, a);
 #endif
     }
@@ -332,7 +332,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
       a= _mm256_macc_pd(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       a= _mm256_fmadd_pd( b, c, a);
 #endif
     }
@@ -347,7 +347,7 @@ namespace Optimization {
     }
     // Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) 
+#if defined (AVX1) || defined (AVXFMA)
       __m128i a0,a1;
       __m128i b0,b1;
       a0 = _mm256_extractf128_si256(a,0);
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 184baad9..080dd5c0 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -44,7 +44,7 @@ directory
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
+#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512

From 5833f247fa7a8f85b6afd5397796455070182792 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 24 Nov 2016 09:09:48 +0900
Subject: [PATCH 02/14] more FFt optimisations

---
 lib/FFT.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index b5b31d82..240f338b 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -244,7 +244,10 @@ namespace Grid {
             pokeLocalSite(s,pgbuf,cbuf);
           }
         }
-        result = Cshift(result,dim,L);
+        if (p != processors[dim] - 1)
+        {
+          result = Cshift(result,dim,L);
+        }
       }
       
       // Loop over orthog coords
@@ -287,10 +290,10 @@ namespace Grid {
           cgbuf = clbuf;
           cgbuf[dim] = clbuf[dim]+L*pc;
           peekLocalSite(s,pgbuf,cgbuf);
-          s = s * div;
           pokeLocalSite(s,result,clbuf);
         }
       }
+      result = result*div;
       
       // destroying plan
       FFTW<scalar>::fftw_destroy_plan(p);

From 3cdf945d840dfb47c8a939bc113425ee3b8ebe68 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 24 Nov 2016 09:10:03 +0900
Subject: [PATCH 03/14] Test_fftf fix

---
 tests/core/Test_fftf.cc | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc
index 4eb4398d..22838f7b 100644
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
   for(int mu=0;mu<4;mu++){
     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
     LatticeCoordinate(coor,mu);
-    C = C - (TwoPiL * p[mu]) * coor;
+    C = C + (TwoPiL * p[mu]) * coor;
   }
 
   C = exp(C*ci);
@@ -78,10 +78,11 @@ int main (int argc, char ** argv)
 
   FFT theFFT(&Fine);
 
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
+  Ctilde = C;
+  theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 
   //  C=zero;
   //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@@ -93,10 +94,11 @@ int main (int argc, char ** argv)
   C=C-Ctilde;
   std::cout << "diff scalar "<<norm2(C) << std::endl;
 
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
+  Stilde = S;
+  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 
   SpinMatrixF Sp; 
   Sp = zero; Sp = Sp+cVol;

From 0acbf77bc6dbc436e7e9f5df52fafd0f9f521816 Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Thu, 24 Nov 2016 13:24:12 +0000
Subject: [PATCH 04/14] Add QPX Div structure

---
 lib/simd/Grid_qpx.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 07933f52..1cd6edae 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -245,6 +245,21 @@ namespace Optimization {
     }
   };
   
+  struct Div{
+    // Real double
+    inline vector4double operator()(vector4double a, vector4double b){
+      return vec_swdivs(a, b);
+    }
+    
+    // Real float
+    FLOAT_WRAP_2(operator(), inline)
+    
+    // Integer
+    inline int operator()(int a, int b){
+      return a/b;
+    }
+  }
+  
   struct Conj{
     // Complex double
     inline vector4double operator()(vector4double v){
@@ -412,6 +427,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 // Arithmetic operations
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Div         DivSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;

From b18950f776954213ecda0b4785aec4cb67833680 Mon Sep 17 00:00:00 2001
From: Lanny91 <andrew.lawson@gmail.com>
Date: Fri, 25 Nov 2016 13:21:33 +0000
Subject: [PATCH 05/14] Added simd real divide test with QPX divide fixes

---
 lib/simd/Grid_qpx.h | 14 +++++++-------
 tests/Test_simd.cc  |  8 ++++++++
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 1cd6edae..bc86291d 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -244,22 +244,22 @@ namespace Optimization {
       return a*b;
     }
   };
-  
+
   struct Div{
     // Real double
     inline vector4double operator()(vector4double a, vector4double b){
-      return vec_swdivs(a, b);
+      return vec_swdiv(a, b);
     }
-    
+
     // Real float
     FLOAT_WRAP_2(operator(), inline)
-    
+
     // Integer
     inline int operator()(int a, int b){
       return a/b;
     }
-  }
-  
+  };
+
   struct Conj{
     // Complex double
     inline vector4double operator()(vector4double v){
@@ -427,8 +427,8 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 // Arithmetic operations
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
-typedef Optimization::Div         DivSIMD;
 typedef Optimization::Mult        MultSIMD;
+typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 189f0559..92f9bcd8 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -50,6 +50,12 @@ public:
   template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
   std::string name(void) const { return std::string("Times"); }
 };
+class funcDivide {
+public:
+  funcDivide() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
+  std::string name(void) const { return std::string("Divide"); }
+};
 class funcConj {
 public:
   funcConj() {};
@@ -341,6 +347,7 @@ int main (int argc, char ** argv)
   Tester<RealF,vRealF>(funcPlus());
   Tester<RealF,vRealF>(funcMinus());
   Tester<RealF,vRealF>(funcTimes());
+  Tester<RealF,vRealF>(funcDivide());
   Tester<RealF,vRealF>(funcAdj());
   Tester<RealF,vRealF>(funcConj());
   Tester<RealF,vRealF>(funcInnerProduct());
@@ -371,6 +378,7 @@ int main (int argc, char ** argv)
   Tester<RealD,vRealD>(funcPlus());
   Tester<RealD,vRealD>(funcMinus());
   Tester<RealD,vRealD>(funcTimes());
+  Tester<RealD,vRealD>(funcDivide());
   Tester<RealD,vRealD>(funcAdj());
   Tester<RealD,vRealD>(funcConj());
   Tester<RealD,vRealD>(funcInnerProduct());

From 6c0cc5676bc7969f0d82b73146c9c5f9021c135b Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sat, 26 Nov 2016 18:25:12 +0000
Subject: [PATCH 06/14] Adding Eigen.inc to the gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 38e3da2d..da7de5e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,7 @@ config.status
 .deps
 Make.inc
 eigen.inc
+Eigen.inc
 
 # http://www.gnu.org/software/autoconf #
 ########################################

From 1e44fd309402c1ae099be6d5371100e7a3dd4c75 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Sat, 26 Nov 2016 18:30:53 +0000
Subject: [PATCH 07/14] Added some details on the mpi flags for Cray machines

---
 README.md    | 2 +-
 configure.ac | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bfe558a2..c47a257c 100644
--- a/README.md
+++ b/README.md
@@ -138,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications                                     |
 
-For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names).
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
 
 ### Possible SIMD types
 
diff --git a/configure.ac b/configure.ac
index 90764cb7..4225bac5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -290,7 +290,7 @@ esac
 case ${ac_COMMS} in
     *-auto)
         LX_FIND_MPI
-        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
+        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
         AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
         AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
         AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"

From ae9688e343acef3e33776a702faf98b25d78cfaa Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Mon, 28 Nov 2016 11:37:02 +0000
Subject: [PATCH 08/14] Reporting also the total mflops

---
 lib/qcd/action/fermion/WilsonFermion5D.cc | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 4c2d24bf..d2ac96e3 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -194,6 +194,11 @@ void WilsonFermion5D<Impl>::Report(void)
     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 
+    RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+
+
    }
 
   if ( DerivCalls > 0 ) {
@@ -209,12 +214,15 @@ void WilsonFermion5D<Impl>::Report(void)
     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-  }
+
+    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
 
   if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
   }
 }
 

From 4704f2d009eec4d6559b55b82ddd56116c213790 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 29 Nov 2016 00:14:36 +0000
Subject: [PATCH 09/14] Actions updated

---
 lib/qcd/action/Actions.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h
index ba6e577d..4a30f8c3 100644
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -195,6 +195,7 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
+
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@@ -203,6 +204,20 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 
+// Ls vectorised 
+typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
+typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
+typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
+
+typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
+typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
+typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
+
+typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
+typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
+typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
+
+
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@@ -254,6 +269,7 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
 
 
+
   }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code

From 2f92b4860b6e653266fd49bdabcf2690bffbc39b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 29 Nov 2016 00:15:08 +0000
Subject: [PATCH 10/14] Test the full Mooee sector

---
 benchmarks/Benchmark_mooee.cc | 237 ++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100644 benchmarks/Benchmark_mooee.cc

diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc
new file mode 100644
index 00000000..c895109f
--- /dev/null
+++ b/benchmarks/Benchmark_mooee.cc
@@ -0,0 +1,237 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_dwf.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> latt4 = GridDefaultLatt();
+  const int Ls=8;
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  std::cout << GridLogMessage << "Seeded"<<std::endl;
+
+  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  RealD NP = UGrid->_Nprocessors;
+
+
+  if (1)
+  {
+    const int ncall=100;
+
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+
+    GridParallelRNG RNG5(FGrid);
+    LatticeFermion src(FGrid); random(RNG5,src);
+    LatticeFermion result(FGrid);
+
+    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+    FGrid->Barrier();
+
+    double t0,t1;
+    t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+
+    std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+    LatticeFermion r_eo(FGrid);
+    LatticeFermion src_e (FrbGrid);
+    LatticeFermion src_o (FrbGrid);
+    LatticeFermion r_e   (FrbGrid);
+    LatticeFermion r_o   (FrbGrid);
+    
+    pickCheckerboard(Even,src_e,src);
+    pickCheckerboard(Odd,src_o,src);
+    
+    setCheckerboard(r_eo,src_o);
+    setCheckerboard(r_eo,src_e);
+    
+    r_e = zero;
+    r_o = zero;
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.DhopEO(src_o, r_e, DaggerNo);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.Mooee(src_o, r_o);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.MooeeInv(src_o, r_o);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.Meooe(src_o, r_e);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+  }
+
+  if (1)
+  {
+    const int ncall=100;
+
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+
+    GridParallelRNG RNG5(sFGrid);
+    LatticeFermion src(sFGrid); random(RNG5,src);
+    LatticeFermion sref(sFGrid);
+    LatticeFermion result(sFGrid);
+
+    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
+    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
+
+    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
+    FGrid->Barrier();
+
+    double t0,t1;
+    t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+
+    std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+    LatticeFermion r_eo(sFGrid);
+    LatticeFermion src_e (sFrbGrid);
+    LatticeFermion src_o (sFrbGrid);
+    LatticeFermion r_e   (sFrbGrid);
+    LatticeFermion r_o   (sFrbGrid);
+    
+    pickCheckerboard(Even,src_e,src);
+    pickCheckerboard(Odd,src_o,src);
+    
+    setCheckerboard(r_eo,src_o);
+    setCheckerboard(r_eo,src_e);
+    
+    r_e = zero;
+    r_o = zero;
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.DhopEO(src_o, r_e, DaggerNo);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.Mooee(src_o, r_o);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.MooeeInv(src_o, r_o);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+
+    FGrid->Barrier();
+    t0=usecond();
+    for (int i = 0; i < ncall; i++) {
+      Dw.Meooe(src_o, r_e);
+    }
+    t1=usecond();
+    FGrid->Barrier();
+    std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
+
+  }
+
+
+
+  Grid_finalize();
+}

From bd0430b34f8cde299a2833bab45628da8d2cbd56 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 29 Nov 2016 22:27:55 +0000
Subject: [PATCH 11/14] Serialisation in malloc fixed

---
 benchmarks/Benchmark_mooee.cc                 | 128 +++++-------------
 lib/qcd/action/fermion/CayleyFermion5D.cc     |  44 ++++++
 lib/qcd/action/fermion/CayleyFermion5D.h      |  12 ++
 .../action/fermion/CayleyFermion5Dcache.cc    |  20 +++
 lib/qcd/action/fermion/CayleyFermion5Dvec.cc  |  22 ++-
 5 files changed, 128 insertions(+), 98 deletions(-)

diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc
index c895109f..097e6da3 100644
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
 
   if (1)
   {
-    const int ncall=100;
+    const int ncall=1000;
 
     std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
     std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
@@ -81,18 +81,7 @@ int main (int argc, char ** argv)
     LatticeFermion result(FGrid);
 
     DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    FGrid->Barrier();
-
     double t0,t1;
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-
-    std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
 
     LatticeFermion r_eo(FGrid);
     LatticeFermion src_e (FrbGrid);
@@ -109,48 +98,44 @@ int main (int argc, char ** argv)
     r_e = zero;
     r_o = zero;
 
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.DhopEO(src_o, r_e, DaggerNo);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
 
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Mooee(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
+#define BENCH_DW(A,in,out)			\
+    Dw.CayleyZeroCounters();			\
+    FGrid->Barrier();				\
+    t0=usecond();				\
+    for(int i=0;i<ncall;i++){			\
+      Dw. A (in,out);				\
+    }						\
+    t1=usecond();				\
+    FGrid->Barrier();				\
+    Dw.CayleyReport();					\
+    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
+    std::cout<<GridLogMessage << "******************"<<std::endl;
 
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.MooeeInv(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
+#define BENCH_DW_MEO(A,in,out)			\
+    Dw.CayleyZeroCounters();			\
+    FGrid->Barrier();				\
+    t0=usecond();				\
+    for(int i=0;i<ncall;i++){			\
+      Dw. A (in,out,0);				\
+    }						\
+    t1=usecond();				\
+    FGrid->Barrier();				\
+    Dw.CayleyReport();					\
+    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
+    std::cout<<GridLogMessage << "******************"<<std::endl;
 
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Meooe(src_o, r_e);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
+    BENCH_DW_MEO(Dhop    ,src,result);
+    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
+    BENCH_DW(Meooe   ,src_o,r_e);
+    BENCH_DW(Mooee   ,src_o,r_o);
+    BENCH_DW(MooeeInv,src_o,r_o);
 
   }
 
   if (1)
   {
-    const int ncall=100;
+    const int ncall=1000;
 
     std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
     std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
@@ -168,14 +153,6 @@ int main (int argc, char ** argv)
     FGrid->Barrier();
 
     double t0,t1;
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-
-    std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
 
     LatticeFermion r_eo(sFGrid);
     LatticeFermion src_e (sFrbGrid);
@@ -192,46 +169,13 @@ int main (int argc, char ** argv)
     r_e = zero;
     r_o = zero;
 
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.DhopEO(src_o, r_e, DaggerNo);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Mooee(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.MooeeInv(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
-
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Meooe(src_o, r_e);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
+    BENCH_DW_MEO(Dhop    ,src,result);
+    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
+    BENCH_DW(Meooe   ,src_o,r_e);
+    BENCH_DW(Mooee   ,src_o,r_o);
+    BENCH_DW(MooeeInv,src_o,r_o);
 
   }
 
-
-
   Grid_finalize();
 }
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc
index 57b047d4..b8e98dce 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
     axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
   }
 }
+
+
+template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
+{
+  this->Report();
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP     = this->_FourDimGrid->_Nprocessors;
+  if ( M5Dcalls > 0 ) {
+    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol
+    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  }
+
+  if ( MooeeInvCalls > 0 ) {
+
+    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
+
+    // Flops = 9*12*Ls*vol/2
+    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  }
+
+}
+template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
+{
+  this->ZeroCounters();
+  M5Dflops=0;
+  M5Dcalls=0;
+  M5Dtime=0;
+  MooeeInvFlops=0;
+  MooeeInvCalls=0;
+  MooeeInvTime=0;
+}
+
+
 template<class Impl>  
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h
index 1d8c2b95..6fb58234 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -120,6 +120,18 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
 
+      
+
+     void CayleyReport(void);
+     void CayleyZeroCounters(void);
+
+     double M5Dflops;
+     double M5Dcalls;
+     double M5Dtime;
+
+     double MooeeInvFlops;
+     double MooeeInvCalls;
+     double MooeeInvTime;
 
     protected:
       void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
index 62e91dd4..8e7df945 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
   GridBase *grid=psi._grid;
   assert(phi.checkerboard == psi.checkerboard);
   chi.checkerboard=psi.checkerboard;
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
     for(int s=0;s<Ls;s++){
@@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
       }
     }
   }
+  M5Dtime+=usecond();
 }
 
 template<class Impl>  
@@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
   assert(phi.checkerboard == psi.checkerboard);
   chi.checkerboard=psi.checkerboard;
 
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
     auto tmp = psi._odata[0];
@@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
       }
     }
   }
+  M5Dtime+=usecond();
 }
 
 template<class Impl>
@@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
 
   chi.checkerboard=psi.checkerboard;
 
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
     auto tmp = psi._odata[0];
 
+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
     // Apply (L^{\prime})^{-1}
     chi[ss]=psi[ss]; // chi[0]=psi[0]
     for(int s=1;s<Ls;s++){
@@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
       chi[ss+s] = chi[ss+s] - uee[s]*tmp;
     }
   }
+
+  MooeeInvTime+=usecond();
+
 }
 
 template<class Impl>
@@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
   assert(psi.checkerboard == psi.checkerboard);
   chi.checkerboard=psi.checkerboard;
 
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
 
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
       chi[ss+s] = chi[ss+s] - lee[s]*tmp;
     }
   }
+
+  MooeeInvTime+=usecond();
+
 }
 
 #ifdef CAYLEY_DPERP_CACHE
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
index f6569923..3f3f215c 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -71,6 +71,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 
   chi.checkerboard=psi.checkerboard;
 
+
   // just directly address via type pun
   typedef typename Simd::scalar_type scalar_type;
   scalar_type * u_p = (scalar_type *)&u[0];
@@ -86,6 +87,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
     d_p[ss] = diag[s];
   }}
 
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 
@@ -115,6 +118,7 @@ PARALLEL_FOR_LOOP
 
     }
   }
+  M5Dtime+=usecond();
 }
 
 template<class Impl>  
@@ -154,6 +158,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
     d_p[ss] = diag[s];
   }}
 
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 
@@ -183,6 +189,7 @@ PARALLEL_FOR_LOOP
 
     }
   }
+  M5Dtime+=usecond();
 }
 
 template<class Impl>
@@ -250,13 +257,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
     }
   }
   
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
   // Dynamic allocate on stack to get per thread without serialised heap acces
-PARALLEL_FOR_LOOP
-  for(auto site=0;site<vol;site++){
-    
-    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
-    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
-    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
+#pragma omp parallel  
+  {
 
     Vector<SiteHalfSpinor> SitePplus(LLs);
     Vector<SiteHalfSpinor> SitePminus(LLs);
@@ -267,6 +272,9 @@ PARALLEL_FOR_LOOP
     SiteHalfSpinor BcastP;
     SiteHalfSpinor BcastM;
 
+#pragma omp for 
+  for(auto site=0;site<vol;site++){
+
     for(int s=0;s<LLs;s++){
       int lex = s+LLs*site;
       spProj5p(SitePplus[s] ,psi[lex]);
@@ -294,6 +302,8 @@ PARALLEL_FOR_LOOP
       chi[lex] = SiteChi[s]*0.5;
     }
   }
+  }
+  MooeeInvTime+=usecond();
 }
 
 INSTANTIATE_DPERP(DomainWallVec5dImplD);

From cd01c1dbe9a706a01a5d6a100dde5d9ca4f1ff0a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@localhost.localdomain>
Date: Wed, 30 Nov 2016 22:11:10 +0000
Subject: [PATCH 12/14] Ls 16 more relevant

---
 benchmarks/Benchmark_mooee.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc
index 097e6da3..df80288c 100644
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=8;
+  const int Ls=16;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);

From 6adf35da54fb6df2d71980447c8d6271ab34565b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 1 Dec 2016 11:39:04 +0000
Subject: [PATCH 13/14] Faster Mobius

---
 lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 137 ++++++++++++++++---
 lib/simd/Grid_vector_types.h                 |   2 +-
 2 files changed, 117 insertions(+), 22 deletions(-)

diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
index 3f3f215c..cfd96aaf 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
   GridBase *grid=psi._grid;
   int Ls   = this->Ls;
   int LLs  = grid->_rdimensions[0];
-  int nsimd= Simd::Nsimd();
+  const int nsimd= Simd::Nsimd();
 
   Vector<iSinglet<Simd> > u(LLs);
   Vector<iSinglet<Simd> > l(LLs);
@@ -71,7 +71,6 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 
   chi.checkerboard=psi.checkerboard;
 
-
   // just directly address via type pun
   typedef typename Simd::scalar_type scalar_type;
   scalar_type * u_p = (scalar_type *)&u[0];
@@ -87,36 +86,133 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
     d_p[ss] = diag[s];
   }}
 
+
   M5Dcalls++;
   M5Dtime-=usecond();
+
+  assert(Nc==3);
+
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+#if 0
+      alignas(64) SiteHalfSpinor hp;
+      alignas(64) SiteHalfSpinor hm;
+      alignas(64) SiteSpinor fp;
+      alignas(64) SiteSpinor fm;
 
-    alignas(64) SiteHalfSpinor hp;
-    alignas(64) SiteHalfSpinor hm;
-    alignas(64) SiteSpinor fp;
-    alignas(64) SiteSpinor fm;
+      for(int v=0;v<LLs;v++){
 
-    for(int v=0;v<LLs;v++){
+	int vp=(v+1)%LLs;
+	int vm=(v+LLs-1)%LLs;
 
-      int vp=(v+1)%LLs;
-      int vm=(v+LLs-1)%LLs;
+	spProj5m(hp,psi[ss+vp]);
+	spProj5p(hm,psi[ss+vm]);
 
-      spProj5m(hp,psi[ss+vp]);
-      spProj5p(hm,psi[ss+vm]);
+	if ( vp<=v ) rotate(hp,hp,1);
+	if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	
+	hp=0.5*hp;
+        hm=0.5*hm;
+
+	spRecon5m(fp,hp);
+	spRecon5p(fm,hm);
+
+	chi[ss+v] = d[v]*phi[ss+v];
+	chi[ss+v] = chi[ss+v]     +u[v]*fp;
+	chi[ss+v] = chi[ss+v]     +l[v]*fm;
+
+      }
+#else
+      for(int v=0;v<LLs;v++){
       
-      if ( vp<=v ) rotate(hp,hp,1);
-      if ( vm>=v ) rotate(hm,hm,nsimd-1);
+	int vp= (v==LLs-1) ? 0     : v+1;
+	int vm= (v==0    ) ? LLs-1 : v-1;
+	
+	Simd hp_00 = psi[ss+vp]()(2)(0); 
+	Simd hp_01 = psi[ss+vp]()(2)(1); 
+	Simd hp_02 = psi[ss+vp]()(2)(2); 
+	Simd hp_10 = psi[ss+vp]()(3)(0); 
+	Simd hp_11 = psi[ss+vp]()(3)(1); 
+	Simd hp_12 = psi[ss+vp]()(3)(2); 
+	
+	Simd hm_00 = psi[ss+vm]()(0)(0); 
+	Simd hm_01 = psi[ss+vm]()(0)(1); 
+	Simd hm_02 = psi[ss+vm]()(0)(2); 
+	Simd hm_10 = psi[ss+vm]()(1)(0); 
+	Simd hm_11 = psi[ss+vm]()(1)(1); 
+	Simd hm_12 = psi[ss+vm]()(1)(2); 
 
-      hp=hp*0.5;
-      hm=hm*0.5;
-      spRecon5m(fp,hp);
-      spRecon5p(fm,hm);
+	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
+	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
 
-      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
-      chi[ss+v] = chi[ss+v]     +l[v]*fm;
+	if ( vp<=v ) {
+	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+	}
+	if ( vm>=v ) {
+	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+	}
 
-    }
+	/*
+	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
+	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
+	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
+	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
+	*/	
+	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
+	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
+	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
+	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
+	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
+	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
+	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
+	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
+	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
+	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
+	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
+	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
+
+	
+	//	if ( ss==0){
+	/*
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
+	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
+	}
+	*/
+	vstream(chi[ss+v]()(0)(0),p_00);
+	vstream(chi[ss+v]()(0)(1),p_01);
+	vstream(chi[ss+v]()(0)(2),p_02);
+	vstream(chi[ss+v]()(1)(0),p_10);
+	vstream(chi[ss+v]()(1)(1),p_11);
+	vstream(chi[ss+v]()(1)(2),p_12);
+	vstream(chi[ss+v]()(2)(0),p_20);
+	vstream(chi[ss+v]()(2)(1),p_21);
+	vstream(chi[ss+v]()(2)(2),p_22);
+	vstream(chi[ss+v]()(3)(0),p_30);
+	vstream(chi[ss+v]()(3)(1),p_31);
+	vstream(chi[ss+v]()(3)(2),p_32);
+
+      }
+#endif
   }
   M5Dtime+=usecond();
 }
@@ -191,7 +287,6 @@ PARALLEL_FOR_LOOP
   }
   M5Dtime+=usecond();
 }
-
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 080dd5c0..42f28b34 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -130,7 +130,7 @@ class Grid_simd {
 
   Vector_type v;
 
-  static inline int Nsimd(void) {
+  static inline constexpr int Nsimd(void) {
     return sizeof(Vector_type) / sizeof(Scalar_type);
   }
 

From e27c6b217c64edc343c054e77b6fa57b58963f93 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@localhost.localdomain>
Date: Thu, 1 Dec 2016 12:42:53 +0000
Subject: [PATCH 14/14] Updating

---
 benchmarks/Benchmark_mooee.cc                | 2 ++
 lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 5 ++++-
 lib/simd/Grid_avx512.h                       | 8 ++++----
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc
index df80288c..dfaea627 100644
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@@ -101,6 +101,7 @@ int main (int argc, char ** argv)
 
 #define BENCH_DW(A,in,out)			\
     Dw.CayleyZeroCounters();			\
+    Dw. A (in,out);				\
     FGrid->Barrier();				\
     t0=usecond();				\
     for(int i=0;i<ncall;i++){			\
@@ -114,6 +115,7 @@ int main (int argc, char ** argv)
 
 #define BENCH_DW_MEO(A,in,out)			\
     Dw.CayleyZeroCounters();			\
+    Dw. A (in,out,0);				\
     FGrid->Barrier();				\
     t0=usecond();				\
     for(int i=0;i<ncall;i++){			\
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
index cfd96aaf..35a10de2 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -124,7 +124,10 @@ PARALLEL_FOR_LOOP
       }
 #else
       for(int v=0;v<LLs;v++){
-      
+
+	vprefetch(psi[ss+v+LLs]);
+	//	vprefetch(phi[ss+v+LLs]);
+
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
 	
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 38785138..d6531d57 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -86,13 +86,13 @@ namespace Optimization {
   struct Vstream{
     //Float
     inline void operator()(float * a, __m512 b){
-      //_mm512_stream_ps(a,b);
-      _mm512_store_ps(a,b);
+      _mm512_stream_ps(a,b);
+      //      _mm512_store_ps(a,b);
     }
     //Double
     inline void operator()(double * a, __m512d b){
-      //_mm512_stream_pd(a,b);
-      _mm512_store_pd(a,b);
+      _mm512_stream_pd(a,b);
+      //      _mm512_store_pd(a,b);
     }
 
   };