From 3cbe974eb48eb6ee97cb420b08475beeda5cc71b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:55:21 +0100
Subject: [PATCH 01/16] Layout

---
 lib/tensors/Tensor_extract_merge.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h
index 41a431ad..a32d3785 100644
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -250,8 +250,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   }
  }
 
-template<class vobj> inline 
-void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
+template<class vobj> inline void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 {
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;
@@ -269,8 +268,7 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   }}
 }
 
-template<class vobj> inline 
-void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
+template<class vobj> inline void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 {
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

From 8c043da5b7dd583c79e0b540277060e6311d27cc Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:56:05 +0100
Subject: [PATCH 02/16] SHMEM and comms allocator made different

---
 lib/AlignedAllocator.h | 130 ++++++++++++++++++++++++++++-------------
 1 file changed, 88 insertions(+), 42 deletions(-)

diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index 8f69971d..d0de1ec3 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <mm_malloc.h>
 #endif
 
-#ifdef GRID_COMMS_SHMEM
-extern "C" { 
-#include <mpp/shmem.h>
-extern void * shmem_align(size_t, size_t);
-extern void  shmem_free(void *);
-}
-#endif
-
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////
@@ -65,28 +57,81 @@ public:
   typedef _Tp        value_type;
 
   template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
-
   alignedAllocator() throw() { }
-
   alignedAllocator(const alignedAllocator&) throw() { }
-
   template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
-
   ~alignedAllocator() throw() { }
-
   pointer       address(reference __x)       const { return &__x; }
-  //  const_pointer address(const_reference __x) const { return &__x; }
-
   size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 
   pointer allocate(size_type __n, const void* _p= 0)
   { 
+#ifdef HAVE_MM_MALLOC_H
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#else
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#endif
+
+    _Tp tmp;
+#ifdef GRID_NUMA
+#pragma omp parallel for schedule(static)
+  for(int i=0;i<__n;i++){
+    ptr[i]=tmp;
+  }
+#endif 
+    return ptr;
+  }
+
+  void deallocate(pointer __p, size_type) { 
+#ifdef HAVE_MM_MALLOC_H
+    _mm_free((void *)__p); 
+#else
+    free((void *)__p);
+#endif
+  }
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// MPI3 : comms must use shm region
+// SHMEM: comms must use symmetric heap
+//////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_SHMEM
-
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
-
-
+extern "C" { 
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void  shmem_free(void *);
+}
 #define PARANOID_SYMMETRIC_HEAP
+#endif
+
+template<typename _Tp>
+class commAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
+  commAllocator() throw() { }
+  commAllocator(const commAllocator&) throw() { }
+  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
+  ~commAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+#ifdef GRID_COMMS_SHMEM
+  pointer allocate(size_type __n, const void* _p= 0)
+  { 
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
 #ifdef PARANOID_SYMMETRIC_HEAP
     static void * bcast;
     static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
@@ -99,51 +144,52 @@ public:
       BACKTRACEFILE();
       exit(0);
     }
-
     assert( bcast == (void *) ptr);
-
 #endif 
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) { 
+    shmem_free((void *)__p);
+  }
+#elif defined(GRID_COMMS_MPI3)
+  pointer allocate(size_type __n, const void* _p= 0)
+  { 
+#error "implement MPI3 windowed allocate"
+    
+  }
+  void deallocate(pointer __p, size_type) { 
+#error "implement MPI3 windowed allocate"
+  }
 #else
-
+  pointer allocate(size_type __n, const void* _p= 0)
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
-
-#endif
-    _Tp tmp;
-#ifdef GRID_NUMA
-#pragma omp parallel for schedule(static)
-  for(int i=0;i<__n;i++){
-    ptr[i]=tmp;
-  }
-#endif 
     return ptr;
   }
-
   void deallocate(pointer __p, size_type) { 
-#ifdef GRID_COMMS_SHMEM
-    shmem_free((void *)__p);
-#else
 #ifdef HAVE_MM_MALLOC_H
     _mm_free((void *)__p); 
 #else
     free((void *)__p);
-#endif
 #endif
   }
+#endif
   void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
-
   void destroy(pointer __p) { };
 };
+template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
 
-template<typename _Tp>  inline bool
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-
-template<typename _Tp>  inline bool
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+////////////////////////////////////////////////////////////////////////////////
+// Template typedefs
+////////////////////////////////////////////////////////////////////////////////
+template<class T> using Vector    = std::vector<T,alignedAllocator<T> >;           
+template<class T> using commVector = std::vector<T,commAllocator<T> >;              
+template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
     
 }; // namespace Grid
 #endif

From 39f1c880b8172b0354064e4971a27e3cf3aef3d6 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:56:40 +0100
Subject: [PATCH 03/16] mpi3

---
 configure.ac | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/configure.ac b/configure.ac
index 7bcdc49f..d6340a71 100644
--- a/configure.ac
+++ b/configure.ac
@@ -244,6 +244,9 @@ case ${ac_COMMS} in
      mpi)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
      ;;
+     mpi3)
+       AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
+     ;;
      shmem)
        AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
      ;;
@@ -253,6 +256,7 @@ case ${ac_COMMS} in
 esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 
 ############### RNG selection

From 4955672fc369baa88c83708838b8aef5502f8399 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:57:00 +0100
Subject: [PATCH 04/16] MPI3

---
 lib/Cshift.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/Cshift.h b/lib/Cshift.h
index bbc56cae..4eeb6cea 100644
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@@ -38,6 +38,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
+#ifdef GRID_COMMS_MPI3
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 

From cbcfea466f75f185a171ffe529b9a14b7938aff3 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:57:14 +0100
Subject: [PATCH 05/16] MPI3

---
 lib/Makefile.am | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/Makefile.am b/lib/Makefile.am
index bc752af5..35e76cc8 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -3,6 +3,10 @@ if BUILD_COMMS_MPI
   extra_sources+=communicator/Communicator_mpi.cc
 endif
 
+if BUILD_COMMS_MPI3
+  extra_sources+=communicator/Communicator_mpi3.cc
+endif
+
 if BUILD_COMMS_SHMEM
   extra_sources+=communicator/Communicator_shmem.cc
 endif

From c7cccaaa69e7888db33fe8c8c82612da43dee15c Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:57:31 +0100
Subject: [PATCH 06/16] Comm vector for shmem

---
 lib/Stencil.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index db608fe7..65de9945 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -71,7 +71,7 @@
  namespace Grid {
 
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
+Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
 {
   table.resize(0);
   int rd = rhs._grid->_rdimensions[dimension];
@@ -109,7 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
 }
 
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
+Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,commVector<cobj> &buffer,
 			   compressor &compress, int off,int so)
 {
 PARALLEL_FOR_LOOP     
@@ -119,7 +119,7 @@ PARALLEL_FOR_LOOP
 }
 
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
+Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
 			     double &t_table ,double & t_data )
 {
   std::vector<std::pair<int,int> > table;
@@ -347,10 +347,10 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
        }
 
        // Comms buffers
-       std::vector<Vector<scalar_object> > u_simd_send_buf;
-       std::vector<Vector<scalar_object> > u_simd_recv_buf;
-       Vector<cobj>          u_send_buf;
-       Vector<cobj>          comm_buf;
+       std::vector<commVector<scalar_object> > u_simd_send_buf;
+       std::vector<commVector<scalar_object> > u_simd_recv_buf;
+       commVector<cobj>          u_send_buf;
+       commVector<cobj>          comm_buf;
        int u_comm_offset;
        int _unified_buffer_size;
 

From 5fe2b85cbd951e2a0137e8ce291f0556bbb091ea Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:58:01 +0100
Subject: [PATCH 07/16] MPI3 and shared memory support

---
 lib/communicator/Communicator_base.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 94d277e9..9b5ae8cb 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI
 #include <mpi.h>
 #endif
+#ifdef GRID_COMMS_MPI3
+#include <mpi.h>
+#endif
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
@@ -52,6 +55,29 @@ class CartesianCommunicator {
 #ifdef GRID_COMMS_MPI
     MPI_Comm communicator;
     typedef MPI_Request CommsRequest_t;
+#elif  GRID_COMMS_MPI3
+    MPI_Comm communicator;
+    typedef MPI_Request CommsRequest_t;
+
+    const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
+
+    std::vector<int>  WorldDims;
+    std::vector<int>  GroupDims;
+    std::vector<int>  ShmDims;
+
+    std::vector<int> GroupCoor;
+    std::vector<int> ShmCoor;
+    std::vector<int> WorldCoor;
+
+    int GroupRank;
+    int ShmRank;
+    int WorldRank;
+
+    int GroupSize;
+    int ShmSize;
+    int WorldSize;
+
+    std::vector<int>  LexicographicToWorldRank;
 #else 
     typedef int CommsRequest_t;
 #endif

From 9b39f35ae6c3942f21541745cb50b79ec8077981 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:58:53 +0100
Subject: [PATCH 08/16] commVector different for SHMEM compat

---
 lib/cshift/Cshift_common.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h
index b0e9b798..2b146daa 100644
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -45,7 +45,7 @@ public:
 // Gather for when there is no need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
 {
   int rd = rhs._grid->_rdimensions[dimension];
 
@@ -114,6 +114,7 @@ PARALLEL_NESTED_LOOP2
 	int o      =   n*n1;
 	int offset = b+n*n2;
 	cobj temp =compress(rhs._odata[so+o+b]);
+
 	extract<cobj>(temp,pointers,offset);
 
       }
@@ -121,6 +122,7 @@ PARALLEL_NESTED_LOOP2
   } else { 
 
     assert(0); //Fixme think this is buggy
+
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o=n*rhs._grid->_slice_stride[dimension];
@@ -139,7 +141,7 @@ PARALLEL_NESTED_LOOP2
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
+template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
   SimpleCompressor<vobj> dontcompress;
   Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@@ -157,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
   int rd = rhs._grid->_rdimensions[dimension];
 

From 4f8e636a438007a14ff95b2e59fe6297a6c38283 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:59:16 +0100
Subject: [PATCH 09/16] commVector

---
 lib/cshift/Cshift_mpi.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/cshift/Cshift_mpi.h b/lib/cshift/Cshift_mpi.h
index 704fda34..b3c07cd6 100644
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   assert(shift<fd);
   
   int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
-  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
+  commVector<vobj> send_buf(buffer_size);
+  commVector<vobj> recv_buf(buffer_size);
 
   int cb= (cbmask==0x2)? Odd : Even;
   int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@@ -191,8 +191,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
   int words = sizeof(vobj)/sizeof(vector_type);
 
-  std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
-  std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
 
   int bytes = buffer_size*sizeof(scalar_object);
 

From f9d5e95d72291c54a5be219e9a3333e9296a45da Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 16:59:39 +0100
Subject: [PATCH 10/16] allocator template typedefs moved to AlignedAllocator

---
 lib/lattice/Lattice_base.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index 3bfa5613..e04c1443 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -65,9 +65,6 @@ public:
     
 class LatticeExpressionBase {};
 
-template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
-template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
-
 template <typename Op, typename T1>                           
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
  public:

From b58adc6a4bbfa461acb8cdc3d90133ea04ecc35f Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 17:00:15 +0100
Subject: [PATCH 11/16] commVector

---
 lib/communicator/Communicator_mpi3.cc       | 358 ++++++++++++++++++++
 lib/qcd/action/fermion/WilsonKernels.cc     |   6 +-
 lib/qcd/action/fermion/WilsonKernels.h      |  22 +-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc  |  16 +-
 lib/qcd/action/fermion/WilsonKernelsHand.cc |  16 +-
 5 files changed, 388 insertions(+), 30 deletions(-)
 create mode 100644 lib/communicator/Communicator_mpi3.cc

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
new file mode 100644
index 00000000..11734fd9
--- /dev/null
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -0,0 +1,358 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_mpi.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include "Grid.h"
+#include <mpi.h>
+
+namespace Grid {
+
+// Global used by Init and nowhere else. How to hide?
+int Rank(void) {
+  int pe;
+  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+  return pe;
+}
+  // Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+  int flag;
+  MPI_Initialized(&flag); // needed to coexist with other libs apparently
+  if ( !flag ) {
+    MPI_Init(argc,argv);
+  }
+}
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Want to implement some magic ... Group sub-cubes into those on same node
+  //
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{
+  std::vector<int> coor = _processor_coor;
+
+  assert(std::abs(shift) <_processors[dim]);
+
+  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
+  Lexicographic::IndexFromCoor(coor,source,_processors);
+  source = LexicographicToWorldRank[source];
+
+  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
+  Lexicographic::IndexFromCoor(coor,dest,_processors);
+  dest = LexicographicToWorldRank[dest];
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{
+  int rank;
+  Lexicographic::IndexFromCoor(coor,rank,_processors);
+  rank = LexicographicToWorldRank[rank];
+  return rank;
+}
+void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+  Lexicographic::CoorFromIndex(coor,rank,_processors);
+  rank = LexicographicToWorldRank[rank];
+}
+
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
+{ 
+  _ndimension = processors.size();
+  std::cout << "Creating "<< _ndimension << " dim communicator "<<std::endl;
+  for(int d =0;d<_ndimension;d++){
+    std::cout << processors[d]<<" ";
+  };
+  std::cout << std::endl;
+
+  WorldDims = processors;
+
+  communicator = MPI_COMM_WORLD;
+  MPI_Comm shmcomm;
+  MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&shmcomm);
+  MPI_Comm_rank(communicator,&WorldRank);
+  MPI_Comm_size(communicator,&WorldSize);
+  MPI_Comm_rank(shmcomm     ,&ShmRank);
+  MPI_Comm_size(shmcomm     ,&ShmSize);
+  GroupSize = WorldSize/ShmSize;
+
+  std::cout<< "Ranks per node "<< ShmSize << std::endl;
+  std::cout<< "Nodes          "<< GroupSize << std::endl;
+  std::cout<< "Ranks          "<< WorldSize << std::endl;
+  
+  ////////////////////////////////////////////////////////////////
+  // Assert power of two shm_size.
+  ////////////////////////////////////////////////////////////////
+  int log2size = -1;
+  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){  
+    if ( (0x1<<i) == ShmSize ) {
+      log2size = i;
+      break;
+    }
+  }
+  assert(log2size != -1);
+  
+  ////////////////////////////////////////////////////////////////
+  // Identify subblock of ranks on node spreading across dims
+  // in a maximally symmetrical way
+  ////////////////////////////////////////////////////////////////
+  int dim = 0;
+  
+  ShmDims.resize(_ndimension,1);
+  GroupDims.resize(_ndimension);
+  
+  ShmCoor.resize(_ndimension);
+  GroupCoor.resize(_ndimension);
+  WorldCoor.resize(_ndimension);
+  for(int l2=0;l2<log2size;l2++){
+    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
+    ShmDims[dim]*=2;
+    dim=(dim+1)%_ndimension;
+  }
+  
+  std::cout << "Shm group dims "<<std::endl;
+  for(int d =0;d<_ndimension;d++){
+    std::cout << ShmDims[d]<<" ";
+  };
+  std::cout << std::endl;
+
+  ////////////////////////////////////////////////////////////////
+  // Establish torus of processes and nodes with sub-blockings
+  ////////////////////////////////////////////////////////////////
+  for(int d=0;d<_ndimension;d++){
+    GroupDims[d] = WorldDims[d]/ShmDims[d];
+  }
+  std::cout << "Group dims "<<std::endl;
+  for(int d =0;d<_ndimension;d++){
+    std::cout << GroupDims[d]<<" ";
+  };
+  std::cout << std::endl;
+  
+  MPI_Group WorldGroup, ShmGroup;
+  MPI_Comm_group (communicator, &WorldGroup); 
+  MPI_Comm_group (shmcomm, &ShmGroup);
+  
+  std::vector<int> world_ranks(WorldSize); 
+  std::vector<int> group_ranks(WorldSize); 
+  std::vector<int> mygroup(GroupSize);
+  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
+  
+  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &group_ranks[0]); 
+
+  ////////////////////////////////////////////////////////////////
+  // Check processor counts match
+  ////////////////////////////////////////////////////////////////
+  _Nprocessors=1;
+  _processors = processors;
+  _processor_coor.resize(_ndimension);
+  for(int i=0;i<_ndimension;i++){
+    std::cout << " p " << _processors[i]<<std::endl;
+    _Nprocessors*=_processors[i];
+  }
+  std::cout << " World " <<WorldSize <<" Nproc "<<_Nprocessors<<std::endl;
+  assert(WorldSize==_Nprocessors);
+  
+  ///////////////////////////////////////////////////////////////////
+  // Identify who is in my group and noninate the leader
+  ///////////////////////////////////////////////////////////////////
+  int g=0;
+  for(int rank=0;rank<WorldSize;rank++){
+    if(group_ranks[rank]!=MPI_UNDEFINED){
+	  mygroup[g] = rank;
+    }
+  }
+  
+  std::sort(mygroup.begin(),mygroup.end(),std::greater<int>());
+  int myleader = mygroup[0];
+  
+  std::vector<int> leaders_1hot(WorldSize,0);
+  std::vector<int> leaders_group(GroupSize,0);
+  leaders_1hot [ myleader ] = 1;
+      
+  ///////////////////////////////////////////////////////////////////
+  // global sum leaders over comm world
+  ///////////////////////////////////////////////////////////////////
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
+  assert(ierr==0);
+  
+  ///////////////////////////////////////////////////////////////////
+  // find the group leaders world rank
+  ///////////////////////////////////////////////////////////////////
+  int group=0;
+  for(int l=0;l<WorldSize;l++){
+    if(leaders_1hot[l]){
+      leaders_group[group++] = l;
+    }
+  }
+  
+  ///////////////////////////////////////////////////////////////////
+  // Identify the rank of the group in which I (and my leader) live
+  ///////////////////////////////////////////////////////////////////
+  GroupRank=-1;
+  for(int g=0;g<GroupSize;g++){
+    if (myleader == leaders_group[g]){
+      GroupRank=g;
+    }
+  }
+  assert(GroupRank!=-1);
+      
+  ////////////////////////////////////////////////////////////////
+  // Establish mapping between lexico physics coord and WorldRank
+  // 
+  ////////////////////////////////////////////////////////////////
+  LexicographicToWorldRank.resize(WorldSize,0);
+  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
+  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
+  for(int d=0;d<_ndimension;d++){
+    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
+  }
+  _processor_coor = WorldCoor;
+
+  int lexico;
+  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
+  LexicographicToWorldRank[lexico]=WorldRank;
+  _processor = lexico;
+
+  ///////////////////////////////////////////////////////////////////
+  // global sum Lexico to World mapping
+  ///////////////////////////////////////////////////////////////////
+  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
+  assert(ierr==0);
+  
+};
+
+void CartesianCommunicator::GlobalSum(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+					   int dest,
+					   void *recv,
+					   int from,
+					   int bytes)
+{
+  std::vector<CommsRequest_t> reqs(0);
+  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
+  SendToRecvFromComplete(reqs);
+}
+
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+					   void *recv,
+					   int sender,
+					   int receiver,
+					   int bytes)
+{
+  MPI_Status stat;
+  assert(sender != receiver);
+  int tag = sender;
+  if ( _processor == sender ) {
+    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
+  }
+  if ( _processor == receiver ) { 
+    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
+  }
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
+{
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int rank = _processor;
+  int ierr;
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  
+  assert(ierr==0);
+
+  list.push_back(xrq);
+  list.push_back(rrq);
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  int nreq=list.size();
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+
+  assert(ierr==0);
+}
+
+void CartesianCommunicator::Barrier(void)
+{
+  int ierr = MPI_Barrier(communicator);
+  assert(ierr==0);
+}
+
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+  int ierr=MPI_Bcast(data,
+		     bytes,
+		     MPI_BYTE,
+		     root,
+		     communicator);
+  assert(ierr==0);
+}
+
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+{
+  int ierr= MPI_Bcast(data,
+		      bytes,
+		      MPI_BYTE,
+		      root,
+		      MPI_COMM_WORLD);
+  assert(ierr==0);
+}
+
+}
+
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index 49bd98c5..58e62cc8 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -45,7 +45,7 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 template <class Impl>
 void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
+    commVector<SiteHalfSpinor> &buf, int sF,
     int sU, const FermionField &in, FermionField &out) {
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
@@ -222,7 +222,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
 template <class Impl>
 void WilsonKernels<Impl>::DiracOptGenericDhopSite(
     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
+    commVector<SiteHalfSpinor> &buf, int sF,
     int sU, const FermionField &in, FermionField &out) {
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
@@ -398,7 +398,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
 template <class Impl>
 void WilsonKernels<Impl>::DiracOptDhopDir(
     StencilImpl &st, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
+    commVector<SiteHalfSpinor> &buf, int sF,
     int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
   SiteHalfSpinor tmp;
   SiteHalfSpinor chi;
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index 23c145de..66770c3c 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -58,7 +58,7 @@ namespace Grid {
       typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
 	DiracOptDhopSite(
 			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			 commVector<SiteHalfSpinor> &buf,
 			 int sF, int sU, int Ls, int Ns, const FermionField &in,
 			 FermionField &out) {
 #ifdef AVX512
@@ -89,7 +89,7 @@ namespace Grid {
 	  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
 	  DiracOptDhopSite(
 			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			   std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			   commVector<SiteHalfSpinor> &buf,
 			   int sF, int sU, int Ls, int Ns, const FermionField &in,
 			   FermionField &out) {
 	  for (int site = 0; site < Ns; site++) {
@@ -107,7 +107,7 @@ namespace Grid {
 				  void>::type
 	  DiracOptDhopSiteDag(
 			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			      commVector<SiteHalfSpinor> &buf,
 			      int sF, int sU, int Ls, int Ns, const FermionField &in,
 			      FermionField &out) {
 #ifdef AVX512
@@ -139,7 +139,7 @@ namespace Grid {
 				      void>::type
 				      DiracOptDhopSiteDag(
 							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							  std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							  commVector<SiteHalfSpinor> &buf,
 							  int sF, int sU, int Ls, int Ns, const FermionField &in,
 							  FermionField &out) {
 					for (int site = 0; site < Ns; site++) {
@@ -154,7 +154,7 @@ namespace Grid {
 
 				    void DiracOptDhopDir(
 							 StencilImpl &st, DoubledGaugeField &U,
-							 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							 commVector<SiteHalfSpinor> &buf,
 							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
 							 int gamma);
 
@@ -162,34 +162,34 @@ namespace Grid {
 				    // Specialised variants
 				    void DiracOptGenericDhopSite(
 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								 commVector<SiteHalfSpinor> &buf,
 								 int sF, int sU, const FermionField &in, FermionField &out);
 
 				    void DiracOptGenericDhopSiteDag(
 								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								    commVector<SiteHalfSpinor> &buf,
 								    int sF, int sU, const FermionField &in, FermionField &out);
 
 				    void DiracOptAsmDhopSite(
 							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							     commVector<SiteHalfSpinor> &buf,
 							     int sF, int sU, int Ls, int Ns, const FermionField &in,
 							     FermionField &out);
 
 				    void DiracOptAsmDhopSiteDag(
 								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								commVector<SiteHalfSpinor> &buf,
 								int sF, int sU, int Ls, int Ns, const FermionField &in,
 								FermionField &out);
 
 				    void DiracOptHandDhopSite(
 							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							      commVector<SiteHalfSpinor> &buf,
 							      int sF, int sU, const FermionField &in, FermionField &out);
 
 				    void DiracOptHandDhopSiteDag(
 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								 commVector<SiteHalfSpinor> &buf,
 								 int sF, int sU, const FermionField &in, FermionField &out);
 
 	public:
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index b09699ef..7857e89a 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -40,14 +40,14 @@ namespace Grid {
     ///////////////////////////////////////////////////////////
     template<class Impl>
       void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                             std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+                             commVector<SiteHalfSpinor>  &buf,
                              int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
     {
       assert(0);
     }
     template<class Impl>
       void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+                                commVector<SiteHalfSpinor>  &buf,
                                 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
     {
       assert(0);
@@ -86,14 +86,14 @@ namespace Grid {
 #undef KERNEL_DAG
     template<>
     void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 commVector<SiteHalfSpinor>  &buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
       
 #define KERNEL_DAG
     template<>
     void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    commVector<SiteHalfSpinor>  &buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
@@ -111,14 +111,14 @@ namespace Grid {
 #undef KERNEL_DAG
     template<>
     void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  commVector<SiteHalfSpinor>  &buf,
 								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
 #define KERNEL_DAG
     template<>
     void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     commVector<SiteHalfSpinor>  &buf,
 								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
@@ -127,10 +127,10 @@ namespace Grid {
 
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+                                   commVector<SiteHalfSpinor>  &buf,\
                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+                                   commVector<SiteHalfSpinor>  &buf,\
                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 
 
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 15c8ab56..9d7eac23 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -313,7 +313,7 @@ namespace QCD {
 
   template<class Impl>
   void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       commVector<SiteHalfSpinor>  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
   typedef typename Simd::scalar_type S;
@@ -556,7 +556,7 @@ namespace QCD {
 
   template<class Impl>
   void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       commVector<SiteHalfSpinor>  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
   //  std::cout << "Hand op Dhop "<<std::endl;
@@ -804,7 +804,7 @@ namespace QCD {
   ////////////////////////////////////////////////
 template<>
 void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     commVector<SiteHalfSpinor>  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
   assert(0);
@@ -812,7 +812,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Leb
 
 template<>
 void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								commVector<SiteHalfSpinor>  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
   assert(0);
@@ -820,7 +820,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
 
 template<>
 void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     commVector<SiteHalfSpinor>  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
   assert(0);
@@ -828,7 +828,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Leb
 
 template<>
 void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								commVector<SiteHalfSpinor>  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
   assert(0);
@@ -841,10 +841,10 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 
 #define INSTANTIATE_THEM(A) \
 template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+							       commVector<SiteHalfSpinor>  &buf,\
 							       int ss,int sU,const FermionField &in, FermionField &out);\
 template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+								  commVector<SiteHalfSpinor>  &buf,\
 								  int ss,int sU,const FermionField &in, FermionField &out);
 
 INSTANTIATE_THEM(WilsonImplF);

From 5b5925b8e5aa454467f9b53d35dcd9f779a302b2 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 20 Oct 2016 17:09:40 +0100
Subject: [PATCH 12/16] Forgot to add

---
 lib/AlignedAllocator.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index d0de1ec3..89731246 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -161,7 +161,8 @@ public:
 #error "implement MPI3 windowed allocate"
   }
 #else
-  pointer allocate(size_type __n, const void* _p= 0)
+  pointer allocate(size_type __n, const void* _p= 0) 
+  {
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
@@ -187,9 +188,9 @@ template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using Vector    = std::vector<T,alignedAllocator<T> >;           
+template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
 template<class T> using commVector = std::vector<T,commAllocator<T> >;              
-template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
+template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
     
 }; // namespace Grid
 #endif

From 202078eb1b9d02aa5eee05f1ffe0528801a3ef05 Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Fri, 21 Oct 2016 09:07:20 +0100
Subject: [PATCH 13/16] Cray / OpenSHMEM ordering differs

---
 lib/AlignedAllocator.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index 89731246..8301d811 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -130,8 +130,12 @@ public:
 
 #ifdef GRID_COMMS_SHMEM
   pointer allocate(size_type __n, const void* _p= 0)
-  { 
+  {
+#ifdef CRAY
     _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#else
+    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+#endif
 #ifdef PARANOID_SYMMETRIC_HEAP
     static void * bcast;
     static long  psync[_SHMEM_REDUCE_SYNC_SIZE];

From 20a091c3eddfdb67a82ece6413740a93650a2f98 Mon Sep 17 00:00:00 2001
From: azusayamaguchi <ayamaguc@staffmail.ed.ac.uk>
Date: Fri, 21 Oct 2016 09:08:36 +0100
Subject: [PATCH 14/16] Intel vs. Clang intrinsics differences absorbed

---
 lib/simd/Grid_avx512.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 821898d9..d4fd446c 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -41,6 +41,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 namespace Grid{
 namespace Optimization {
+
+  template<class vtype>
+  union uconv {
+    __m512 f;
+    vtype v;
+  };
+
+  union u512f {
+    __m512 v;
+    float f[8];
+  };
+
+  union u512d {
+    __m512 v;
+    double f[4];
+  };
   
   struct Vsplat{
     //Complex float
@@ -361,8 +377,9 @@ namespace Optimization {
   // Some Template specialization
 
   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-#undef GNU_CLANG_COMPILER 
-#ifdef GNU_CLANG_COMPILER
+
+#ifndef __INTEL_COMPILER
+#warning "Slow reduction due to incomplete reduce intrinsics"
   //Complex float Reduce
   template<>
     inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){

From 63d219498ba4a036d74c8520e7455c94543cff49 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 21 Oct 2016 13:10:13 +0100
Subject: [PATCH 15/16] first (dirty) implementation of Feynman stoctachtic EM
 field

---
 lib/qcd/action/gauge/Photon.h | 344 ++++++++++++++++++++++------------
 1 file changed, 226 insertions(+), 118 deletions(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index 29d40f8b..ea6f21b9 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -1,126 +1,234 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/gauge/Photon.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/action/gauge/Photon.h
+ 
+ Copyright (C) 2015
+ 
+ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
 #ifndef QCD_PHOTON_ACTION_H
 #define QCD_PHOTON_ACTION_H
 
 namespace Grid{
-  namespace QCD{
+namespace QCD{
+  
+  template<class Gimpl>
+  class Photon
+  {
+  public:
+    INHERIT_GIMPL_TYPES(Gimpl);
+    enum class Gauge    {Feynman, Coulomb, Landau};
+    enum class ZmScheme {QedL, QedTL};
+  public:
+    Photon(Gauge gauge, ZmScheme zmScheme);
+    virtual ~Photon(void) = default;
+    void FreePropagator(const GaugeField &in, GaugeField &out);
+    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
+    void StochasticField(GaugeField &out, GridParallelRNG &rng);
+  private:
+    void kHatSquared(GaugeLinkField &out);
+    void zmSub(GaugeLinkField &out);
+  private:
+    Gauge    gauge_;
+    ZmScheme zmScheme_;
+  };
 
-    template<class Gimpl>
-    class Photon  {
-      
-    public:
-
-      INHERIT_GIMPL_TYPES(Gimpl);
-
-      enum PhotonType { FEYNMAN_L, FEYNMAN_TL };
-
-      PhotonType GaugeBC;
-      
-      Photon(PhotonType _GaugeBC) : GaugeBC(_GaugeBC){};
-
-      void  FreePropagator (const GaugeField &in,GaugeField &out) {
-	FFT theFFT((GridCartesian *) in._grid);
-
-	GaugeField in_k(in._grid);
-	GaugeField prop_k(in._grid);
-
-	theFFT.FFT_all_dim(in_k,in,FFT::forward);
-        MomentumSpacePropagator(prop_k,in_k);
-	theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-      }
-      void  MomentumSpacePropagator(GaugeField &out,const GaugeField &in) {
-	if ( GaugeBC == FEYNMAN_L ) { 
-	  FeynmanGaugeMomentumSpacePropagator_L(out,in);
-	} else if ( GaugeBC == FEYNMAN_TL ) { 
-	  FeynmanGaugeMomentumSpacePropagator_TL(out,in);
-	} else {  // No coulomb yet
-	  assert(0);
-	}
-      } 
-      void  FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out, const GaugeField &in) { 
-
-	FeynmanGaugeMomentumSpacePropagator_TL(out,in);
-
-	GridBase *grid = out._grid;
-	LatticeInteger     coor(grid);
-	GaugeField zz(grid); zz=zero;
-
-	// xyzt
-	for(int d = 0; d < grid->_ndimension-1;d++){
-	  LatticeCoordinate(coor,d);
-	  out = where(coor==Integer(0),zz,out);
-	}
-      }
-
-      void  FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out, const GaugeField &in) { 
-
-	// what type LatticeComplex 
-	GridBase *grid = out._grid;
-	int nd = grid->_ndimension;
-
-	typedef typename GaugeField::vector_type vector_type;
-	typedef typename GaugeField::scalar_type ScalComplex;
-	typedef Lattice<iSinglet<vector_type> > LatComplex;
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
+  : gauge_(gauge), zmScheme_(zmScheme)
+  {}
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
+  {
+    FFT theFFT(in._grid);
     
-	std::vector<int> latt_size   = grid->_fdimensions;
-
-	LatComplex denom(grid); denom= zero;
-	LatComplex   one(grid); one = ScalComplex(1.0,0.0);
-	LatComplex   kmu(grid); 
-
-	ScalComplex ci(0.0,1.0);
-	// momphase = n * 2pi / L
-	for(int mu=0;mu<Nd;mu++) {
-	  
-	  LatticeCoordinate(kmu,mu);
-	  
-	  RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
-
-	  kmu = TwoPiL * kmu ;
-	  
-	  denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
-	}
-	std::vector<int> zero_mode(nd,0);
-	TComplexD Tone = ComplexD(1.0,0.0);
-	TComplexD Tzero= ComplexD(0.0,0.0);
-
-	pokeSite(Tone,denom,zero_mode); 
-
-	denom= one/denom;
-	
-	pokeSite(Tzero,denom,zero_mode); 
-
-	out = zero;
-	out = in*denom;
-      };
-
-    };
-
+    GaugeField in_k(in._grid);
+    GaugeField prop_k(in._grid);
+    
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    MomentumSpacePropagator(prop_k,in_k);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::kHatSquared(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    GaugeLinkField     kmu(grid);
+    const unsigned int nd    = grid->_ndimension;
+    std::vector<int>   &l    = grid->_fdimensions;
+    
+    out = zero;
+    for(int mu = 0; mu < nd; mu++)
+    {
+      RealD twoPiL = M_PI*2./l[mu];
+      
+      LatticeCoordinate(kmu,mu);
+      kmu = 2.*sin(.5*twoPiL*kmu);
+      out = out + kmu*kmu;
+    }
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    const unsigned int nd    = grid->_ndimension;
+    
+    switch (zmScheme_)
+    {
+      case ZmScheme::QedTL:
+      {
+        std::vector<int> zm(nd,0);
+        TComplexD        Tzero = ComplexD(0.0,0.0);
+        
+        pokeSite(Tzero, out, zm);
+        
+        break;
+      }
+      case ZmScheme::QedL:
+      {
+        LatticeInteger spNrm(grid), coor(grid);
+        GaugeLinkField z(grid);
+        
+        spNrm = zero;
+        for(int d = 0; d < grid->_ndimension - 1; d++)
+        {
+          LatticeCoordinate(coor,d);
+          spNrm = spNrm + coor*coor;
+        }
+        out = where(spNrm == 0, 0.*out, out);
+        
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
+                                               GaugeField &out)
+  {
+    GridBase           *grid = out._grid;
+    const unsigned int nd    = grid->_ndimension;
+    LatticeComplex     k2Inv(grid), one(grid);
+    
+    one = ComplexD(1.0,0.0);
+    kHatSquared(k2Inv);
+    
+    std::vector<int> zm(nd,0);
+    TComplexD Tone = ComplexD(1.0,0.0);
+    TComplexD Tzero= ComplexD(0.0,0.0);
+    
+    pokeSite(Tone, k2Inv, zm);
+    k2Inv = one/k2Inv;
+    pokeSite(Tzero, k2Inv,zm);
+    
+    out = in*k2Inv;
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  {
+    auto               *grid = out._grid;
+    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     k2(grid), r(grid);
+    GaugeField         aTilde(grid);
+    FFT                fft(grid);
+    
+    kHatSquared(k2);
+    zmSub(k2);
+    for(int mu = 0; mu < nd; mu++)
+    {
+      gaussian(rng, r);
+      r = k2*r;
+      pokeLorentz(aTilde, r, mu);
+    }
+    fft.FFT_all_dim(out, aTilde, FFT::backward);
+  }
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
+//                                                            const GaugeField &in)
+//  {
+//    
+//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
+//    
+//    GridBase *grid = out._grid;
+//    LatticeInteger     coor(grid);
+//    GaugeField zz(grid); zz=zero;
+//    
+//    // xyzt
+//    for(int d = 0; d < grid->_ndimension-1;d++){
+//      LatticeCoordinate(coor,d);
+//      out = where(coor==Integer(0),zz,out);
+//    }
+//  }
+//  
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
+//                                                             const GaugeField &in)
+//  {
+//    
+//    // what type LatticeComplex
+//    GridBase *grid = out._grid;
+//    int nd = grid->_ndimension;
+//    
+//    typedef typename GaugeField::vector_type vector_type;
+//    typedef typename GaugeField::scalar_type ScalComplex;
+//    typedef Lattice<iSinglet<vector_type> > LatComplex;
+//    
+//    std::vector<int> latt_size   = grid->_fdimensions;
+//    
+//    LatComplex denom(grid); denom= zero;
+//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
+//    LatComplex   kmu(grid);
+//    
+//    ScalComplex ci(0.0,1.0);
+//    // momphase = n * 2pi / L
+//    for(int mu=0;mu<Nd;mu++) {
+//      
+//      LatticeCoordinate(kmu,mu);
+//      
+//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+//      
+//      kmu = TwoPiL * kmu ;
+//      
+//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
+//    }
+//    std::vector<int> zero_mode(nd,0);
+//    TComplexD Tone = ComplexD(1.0,0.0);
+//    TComplexD Tzero= ComplexD(0.0,0.0);
+//    
+//    pokeSite(Tone,denom,zero_mode);
+//    
+//    denom= one/denom;
+//    
+//    pokeSite(Tzero,denom,zero_mode);
+//    
+//    out = zero;
+//    out = in*denom;
+//  };
+  
 }}
 #endif

From 462921e5491fbef99c93cfc460819890b0a63b66 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 21 Oct 2016 14:41:08 +0100
Subject: [PATCH 16/16] QED: fix stochastic field

---
 lib/qcd/action/gauge/Photon.h | 37 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/lib/qcd/action/gauge/Photon.h b/lib/qcd/action/gauge/Photon.h
index ea6f21b9..aac72898 100644
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -45,7 +45,7 @@ namespace QCD{
     void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
     void StochasticField(GaugeField &out, GridParallelRNG &rng);
   private:
-    void kHatSquared(GaugeLinkField &out);
+    void invKHatSquared(GaugeLinkField &out);
     void zmSub(GaugeLinkField &out);
   private:
     Gauge    gauge_;
@@ -71,13 +71,17 @@ namespace QCD{
   }
   
   template<class Gimpl>
-  void Photon<Gimpl>::kHatSquared(GaugeLinkField &out)
+  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
   {
     GridBase           *grid = out._grid;
-    GaugeLinkField     kmu(grid);
+    GaugeLinkField     kmu(grid), one(grid);
     const unsigned int nd    = grid->_ndimension;
     std::vector<int>   &l    = grid->_fdimensions;
+    std::vector<int>   zm(nd,0);
+    TComplexD          Tone = ComplexD(1.0,0.0);
+    TComplexD          Tzero= ComplexD(0.0,0.0);
     
+    one = ComplexD(1.0,0.0);
     out = zero;
     for(int mu = 0; mu < nd; mu++)
     {
@@ -87,6 +91,9 @@ namespace QCD{
       kmu = 2.*sin(.5*twoPiL*kmu);
       out = out + kmu*kmu;
     }
+    pokeSite(Tone, out, zm);
+    out = one/out;
+    pokeSite(Tzero, out,zm);
   }
   
   template<class Gimpl>
@@ -131,19 +138,10 @@ namespace QCD{
                                                GaugeField &out)
   {
     GridBase           *grid = out._grid;
-    const unsigned int nd    = grid->_ndimension;
-    LatticeComplex     k2Inv(grid), one(grid);
+    LatticeComplex     k2Inv(grid);
     
-    one = ComplexD(1.0,0.0);
-    kHatSquared(k2Inv);
-    
-    std::vector<int> zm(nd,0);
-    TComplexD Tone = ComplexD(1.0,0.0);
-    TComplexD Tzero= ComplexD(0.0,0.0);
-    
-    pokeSite(Tone, k2Inv, zm);
-    k2Inv = one/k2Inv;
-    pokeSite(Tzero, k2Inv,zm);
+    invKHatSquared(k2Inv);
+    zmSub(k2Inv);
     
     out = in*k2Inv;
   }
@@ -153,16 +151,17 @@ namespace QCD{
   {
     auto               *grid = out._grid;
     const unsigned int nd = grid->_ndimension;
-    GaugeLinkField     k2(grid), r(grid);
+    GaugeLinkField     sqrtK2Inv(grid), r(grid);
     GaugeField         aTilde(grid);
     FFT                fft(grid);
     
-    kHatSquared(k2);
-    zmSub(k2);
+    invKHatSquared(sqrtK2Inv);
+    sqrtK2Inv = sqrt(real(sqrtK2Inv));
+    zmSub(sqrtK2Inv);
     for(int mu = 0; mu < nd; mu++)
     {
       gaussian(rng, r);
-      r = k2*r;
+      r = sqrtK2Inv*r;
       pokeLorentz(aTilde, r, mu);
     }
     fft.FFT_all_dim(out, aTilde, FFT::backward);