Merge branch 'feature/feynman-rules' into feature/qed-fvol

2025-10-27 18:19:34 +00:00 · 2016-10-21 14:42:18 +01:00
parent 6e4a06e180 462921e549
commit ab31ad006a
17 changed files with 781 additions and 213 deletions
--- a/configure.ac
+++ b/configure.ac
@@ -244,6 +244,9 @@ case ${ac_COMMS} in
     mpi)
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
     ;;
     mpi3)
       AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
     ;;
     shmem)
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
@@ -253,6 +256,7 @@ case ${ac_COMMS} in
 esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 ############### RNG selection
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <mm_malloc.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
 extern "C" { 
 #include <mpp/shmem.h>
 extern void * shmem_align(size_t, size_t);
 extern void  shmem_free(void *);
 }
 #endif
 namespace Grid {
 ////////////////////////////////////////////////////////////////////
@@ -65,28 +57,85 @@ public:
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
  alignedAllocator() throw() { }
  alignedAllocator(const alignedAllocator&) throw() { }
  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
  ~alignedAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  //  const_pointer address(const_reference __x) const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
  pointer allocate(size_type __n, const void* _p= 0)
  { 
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
    _Tp tmp;
 #ifdef GRID_NUMA
 #pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
 #endif 
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
    free((void *)__p);
 #endif
  }
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 //////////////////////////////////////////////////////////////////////////////////////////
 // MPI3 : comms must use shm region
 // SHMEM: comms must use symmetric heap
 //////////////////////////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_SHMEM
-
+extern "C" { 
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#include <mpp/shmem.h>
-
+extern void * shmem_align(size_t, size_t);
-
+extern void  shmem_free(void *);
 }
 #define PARANOID_SYMMETRIC_HEAP
 #endif
 template<typename _Tp>
 class commAllocator {
 public: 
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;
  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
  commAllocator() throw() { }
  commAllocator(const commAllocator&) throw() { }
  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
  ~commAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 #ifdef GRID_COMMS_SHMEM
  pointer allocate(size_type __n, const void* _p= 0)
  {
 #ifdef CRAY
    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
 #else
    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
 #endif
 #ifdef PARANOID_SYMMETRIC_HEAP
    static void * bcast;
    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
@@ -99,51 +148,53 @@ public:
      BACKTRACEFILE();
      exit(0);
    }
    assert( bcast == (void *) ptr);
 #endif 
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
    shmem_free((void *)__p);
  }
 #elif defined(GRID_COMMS_MPI3)
  pointer allocate(size_type __n, const void* _p= 0)
  { 
 #error "implement MPI3 windowed allocate"
  }
  void deallocate(pointer __p, size_type) { 
 #error "implement MPI3 windowed allocate"
  }
 #else
-
+  pointer allocate(size_type __n, const void* _p= 0) 
  {
 #ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
 #endif
    _Tp tmp;
 #ifdef GRID_NUMA
 #pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
 #endif 
    return ptr;
  }
  void deallocate(pointer __p, size_type) { 
 #ifdef GRID_COMMS_SHMEM
    shmem_free((void *)__p);
 #else
 #ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p); 
 #else
    free((void *)__p);
 #endif
 #endif
  }
 #endif
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
 };
 template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
 template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
-template<typename _Tp>  inline bool
+////////////////////////////////////////////////////////////////////////////////
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+// Template typedefs
-
+////////////////////////////////////////////////////////////////////////////////
-template<typename _Tp>  inline bool
+template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;           
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+template<class T> using commVector = std::vector<T,commAllocator<T> >;              
 template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
 }; // namespace Grid
 #endif
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@@ -38,6 +38,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_MPI3
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -3,6 +3,10 @@ if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
 endif
 if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
 endif
 if BUILD_COMMS_SHMEM
  extra_sources+=communicator/Communicator_shmem.cc
 endif
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -71,7 +71,7 @@
 namespace Grid {
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
+Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
 {
  table.resize(0);
  int rd = rhs._grid->_rdimensions[dimension];
@@ -109,7 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
 }
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
+Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,commVector<cobj> &buffer,
 			   compressor &compress, int off,int so)
 {
 PARALLEL_FOR_LOOP     
@@ -119,7 +119,7 @@ PARALLEL_FOR_LOOP
 }
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
+Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
 			     double &t_table ,double & t_data )
 {
  std::vector<std::pair<int,int> > table;
@@ -347,10 +347,10 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
       }
       // Comms buffers
-       std::vector<Vector<scalar_object> > u_simd_send_buf;
+       std::vector<commVector<scalar_object> > u_simd_send_buf;
-       std::vector<Vector<scalar_object> > u_simd_recv_buf;
+       std::vector<commVector<scalar_object> > u_simd_recv_buf;
-       Vector<cobj>          u_send_buf;
+       commVector<cobj>          u_send_buf;
-       Vector<cobj>          comm_buf;
+       commVector<cobj>          comm_buf;
       int u_comm_offset;
       int _unified_buffer_size;
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
@@ -52,6 +55,29 @@ class CartesianCommunicator {
 #ifdef GRID_COMMS_MPI
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
 #elif  GRID_COMMS_MPI3
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
    const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now
    std::vector<int>  WorldDims;
    std::vector<int>  GroupDims;
    std::vector<int>  ShmDims;
    std::vector<int> GroupCoor;
    std::vector<int> ShmCoor;
    std::vector<int> WorldCoor;
    int GroupRank;
    int ShmRank;
    int WorldRank;
    int GroupSize;
    int ShmSize;
    int WorldSize;
    std::vector<int>  LexicographicToWorldRank;
 #else 
    typedef int CommsRequest_t;
 #endif
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -0,0 +1,358 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "Grid.h"
 #include <mpi.h>
 namespace Grid {
 // Global used by Init and nowhere else. How to hide?
 int Rank(void) {
  int pe;
  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
  return pe;
 }
  // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
 }
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Want to implement some magic ... Group sub-cubes into those on same node
  //
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  std::vector<int> coor = _processor_coor;
  assert(std::abs(shift) <_processors[dim]);
  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,source,_processors);
  source = LexicographicToWorldRank[source];
  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,dest,_processors);
  dest = LexicographicToWorldRank[dest];
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int rank;
  Lexicographic::IndexFromCoor(coor,rank,_processors);
  rank = LexicographicToWorldRank[rank];
  return rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  Lexicographic::CoorFromIndex(coor,rank,_processors);
  rank = LexicographicToWorldRank[rank];
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
  _ndimension = processors.size();
  std::cout << "Creating "<< _ndimension << " dim communicator "<<std::endl;
  for(int d =0;d<_ndimension;d++){
    std::cout << processors[d]<<" ";
  };
  std::cout << std::endl;
  WorldDims = processors;
  communicator = MPI_COMM_WORLD;
  MPI_Comm shmcomm;
  MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&shmcomm);
  MPI_Comm_rank(communicator,&WorldRank);
  MPI_Comm_size(communicator,&WorldSize);
  MPI_Comm_rank(shmcomm     ,&ShmRank);
  MPI_Comm_size(shmcomm     ,&ShmSize);
  GroupSize = WorldSize/ShmSize;
  std::cout<< "Ranks per node "<< ShmSize << std::endl;
  std::cout<< "Nodes          "<< GroupSize << std::endl;
  std::cout<< "Ranks          "<< WorldSize << std::endl;
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){  
    if ( (0x1<<i) == ShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);
  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int dim = 0;
  ShmDims.resize(_ndimension,1);
  GroupDims.resize(_ndimension);
  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
  for(int l2=0;l2<log2size;l2++){
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%_ndimension;
  }
  std::cout << "Shm group dims "<<std::endl;
  for(int d =0;d<_ndimension;d++){
    std::cout << ShmDims[d]<<" ";
  };
  std::cout << std::endl;
  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<_ndimension;d++){
    GroupDims[d] = WorldDims[d]/ShmDims[d];
  }
  std::cout << "Group dims "<<std::endl;
  for(int d =0;d<_ndimension;d++){
    std::cout << GroupDims[d]<<" ";
  };
  std::cout << std::endl;
  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (communicator, &WorldGroup); 
  MPI_Comm_group (shmcomm, &ShmGroup);
  std::vector<int> world_ranks(WorldSize); 
  std::vector<int> group_ranks(WorldSize); 
  std::vector<int> mygroup(GroupSize);
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &group_ranks[0]); 
  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  for(int i=0;i<_ndimension;i++){
    std::cout << " p " << _processors[i]<<std::endl;
    _Nprocessors*=_processors[i];
  }
  std::cout << " World " <<WorldSize <<" Nproc "<<_Nprocessors<<std::endl;
  assert(WorldSize==_Nprocessors);
  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and noninate the leader
  ///////////////////////////////////////////////////////////////////
  int g=0;
  for(int rank=0;rank<WorldSize;rank++){
    if(group_ranks[rank]!=MPI_UNDEFINED){
 	  mygroup[g] = rank;
    }
  }
  std::sort(mygroup.begin(),mygroup.end(),std::greater<int>());
  int myleader = mygroup[0];
  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(GroupSize,0);
  leaders_1hot [ myleader ] = 1;
  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);
  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      leaders_group[group++] = l;
    }
  }
  ///////////////////////////////////////////////////////////////////
  // Identify the rank of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  GroupRank=-1;
  for(int g=0;g<GroupSize;g++){
    if (myleader == leaders_group[g]){
      GroupRank=g;
    }
  }
  assert(GroupRank!=-1);
  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  // 
  ////////////////////////////////////////////////////////////////
  LexicographicToWorldRank.resize(WorldSize,0);
  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
  for(int d=0;d<_ndimension;d++){
    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
  }
  _processor_coor = WorldCoor;
  int lexico;
  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
  LexicographicToWorldRank[lexico]=WorldRank;
  _processor = lexico;
  ///////////////////////////////////////////////////////////////////
  // global sum Lexico to World mapping
  ///////////////////////////////////////////////////////////////////
  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);
 };
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
 					   void *recv,
 					   int from,
 					   int bytes)
 {
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
 }
 void CartesianCommunicator::Barrier(void)
 {
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
 		     root,
 		     communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
 		      root,
 		      MPI_COMM_WORLD);
  assert(ierr==0);
 }
 }
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -45,7 +45,7 @@ public:
 // Gather for when there is no need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@@ -114,6 +114,7 @@ PARALLEL_NESTED_LOOP2
 	int o      =   n*n1;
 	int offset = b+n*n2;
 	cobj temp =compress(rhs._odata[so+o+b]);
 	extract<cobj>(temp,pointers,offset);
      }
@@ -121,6 +122,7 @@ PARALLEL_NESTED_LOOP2
  } else { 
    assert(0); //Fixme think this is buggy
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o=n*rhs._grid->_slice_stride[dimension];
@@ -139,7 +141,7 @@ PARALLEL_NESTED_LOOP2
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
+template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  SimpleCompressor<vobj> dontcompress;
  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@@ -157,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];
--- a/lib/cshift/Cshift_mpi.h
+++ b/lib/cshift/Cshift_mpi.h
@@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
+  commVector<vobj> send_buf(buffer_size);
-  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
+  commVector<vobj> recv_buf(buffer_size);
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@@ -191,8 +191,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  int words = sizeof(vobj)/sizeof(vector_type);
-  std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
-  std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
  int bytes = buffer_size*sizeof(scalar_object);
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -65,9 +65,6 @@ public:
 class LatticeExpressionBase {};
 template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
 template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
 template <typename Op, typename T1>                           
 class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 public:
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -45,7 +45,7 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 template <class Impl>
 void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
+    commVector<SiteHalfSpinor> &buf, int sF,
    int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
@@ -222,7 +222,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
 template <class Impl>
 void WilsonKernels<Impl>::DiracOptGenericDhopSite(
    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
+    commVector<SiteHalfSpinor> &buf, int sF,
    int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
@@ -398,7 +398,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
 template <class Impl>
 void WilsonKernels<Impl>::DiracOptDhopDir(
    StencilImpl &st, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
+    commVector<SiteHalfSpinor> &buf, int sF,
    int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -58,7 +58,7 @@ namespace Grid {
      typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
 	DiracOptDhopSite(
 			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			 commVector<SiteHalfSpinor> &buf,
 			 int sF, int sU, int Ls, int Ns, const FermionField &in,
 			 FermionField &out) {
 #ifdef AVX512
@@ -89,7 +89,7 @@ namespace Grid {
 	  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
 	  DiracOptDhopSite(
 			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			   std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			   commVector<SiteHalfSpinor> &buf,
 			   int sF, int sU, int Ls, int Ns, const FermionField &in,
 			   FermionField &out) {
 	  for (int site = 0; site < Ns; site++) {
@@ -107,7 +107,7 @@ namespace Grid {
 				  void>::type
 	  DiracOptDhopSiteDag(
 			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+			      commVector<SiteHalfSpinor> &buf,
 			      int sF, int sU, int Ls, int Ns, const FermionField &in,
 			      FermionField &out) {
 #ifdef AVX512
@@ -139,7 +139,7 @@ namespace Grid {
 				      void>::type
 				      DiracOptDhopSiteDag(
 							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							  std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							  commVector<SiteHalfSpinor> &buf,
 							  int sF, int sU, int Ls, int Ns, const FermionField &in,
 							  FermionField &out) {
 					for (int site = 0; site < Ns; site++) {
@@ -154,7 +154,7 @@ namespace Grid {
 				    void DiracOptDhopDir(
 							 StencilImpl &st, DoubledGaugeField &U,
-							 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							 commVector<SiteHalfSpinor> &buf,
 							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
 							 int gamma);
@@ -162,34 +162,34 @@ namespace Grid {
 				    // Specialised variants
 				    void DiracOptGenericDhopSite(
 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								 commVector<SiteHalfSpinor> &buf,
 								 int sF, int sU, const FermionField &in, FermionField &out);
 				    void DiracOptGenericDhopSiteDag(
 								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								    commVector<SiteHalfSpinor> &buf,
 								    int sF, int sU, const FermionField &in, FermionField &out);
 				    void DiracOptAsmDhopSite(
 							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							     commVector<SiteHalfSpinor> &buf,
 							     int sF, int sU, int Ls, int Ns, const FermionField &in,
 							     FermionField &out);
 				    void DiracOptAsmDhopSiteDag(
 								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								commVector<SiteHalfSpinor> &buf,
 								int sF, int sU, int Ls, int Ns, const FermionField &in,
 								FermionField &out);
 				    void DiracOptHandDhopSite(
 							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-							      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+							      commVector<SiteHalfSpinor> &buf,
 							      int sF, int sU, const FermionField &in, FermionField &out);
 				    void DiracOptHandDhopSiteDag(
 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+								 commVector<SiteHalfSpinor> &buf,
 								 int sF, int sU, const FermionField &in, FermionField &out);
 	public:
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -40,14 +40,14 @@ namespace Grid {
    ///////////////////////////////////////////////////////////
    template<class Impl>
      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                             std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+                             commVector<SiteHalfSpinor>  &buf,
                             int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
    }
    template<class Impl>
      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-                                std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+                                commVector<SiteHalfSpinor>  &buf,
                                int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
@@ -86,14 +86,14 @@ namespace Grid {
 #undef KERNEL_DAG
    template<>
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 commVector<SiteHalfSpinor>  &buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #define KERNEL_DAG
    template<>
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    commVector<SiteHalfSpinor>  &buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@@ -111,14 +111,14 @@ namespace Grid {
 #undef KERNEL_DAG
    template<>
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  commVector<SiteHalfSpinor>  &buf,
 								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #define KERNEL_DAG
    template<>
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     commVector<SiteHalfSpinor>  &buf,
 								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@@ -127,10 +127,10 @@ namespace Grid {
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+                                   commVector<SiteHalfSpinor>  &buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
-                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+                                   commVector<SiteHalfSpinor>  &buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -313,7 +313,7 @@ namespace QCD {
  template<class Impl>
  void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       commVector<SiteHalfSpinor>  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  typedef typename Simd::scalar_type S;
@@ -556,7 +556,7 @@ namespace QCD {
  template<class Impl>
  void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       commVector<SiteHalfSpinor>  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
  //  std::cout << "Hand op Dhop "<<std::endl;
@@ -804,7 +804,7 @@ namespace QCD {
  ////////////////////////////////////////////////
 template<>
 void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     commVector<SiteHalfSpinor>  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
@@ -812,7 +812,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Leb
 template<>
 void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								commVector<SiteHalfSpinor>  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
@@ -820,7 +820,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
 template<>
 void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     commVector<SiteHalfSpinor>  &buf,
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
@@ -828,7 +828,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Leb
 template<>
 void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								commVector<SiteHalfSpinor>  &buf,
 								int sF,int sU,const FermionField &in, FermionField &out)
 {
  assert(0);
@@ -841,10 +841,10 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 #define INSTANTIATE_THEM(A) \
 template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+							       commVector<SiteHalfSpinor>  &buf,\
 							       int ss,int sU,const FermionField &in, FermionField &out);\
 template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+								  commVector<SiteHalfSpinor>  &buf,\
 								  int ss,int sU,const FermionField &in, FermionField &out);
 INSTANTIATE_THEM(WilsonImplF);
--- a/lib/qcd/action/gauge/Photon.h
+++ b/lib/qcd/action/gauge/Photon.h
@@ -1,126 +1,233 @@
-    /*************************************************************************************
+/*************************************************************************************
-
+ 
-    Grid physics library, www.github.com/paboyle/Grid 
+ Grid physics library, www.github.com/paboyle/Grid
-
+ 
-    Source file: ./lib/qcd/action/gauge/Photon.h
+ Source file: ./lib/qcd/action/gauge/Photon.h
-
+ 
-    Copyright (C) 2015
+ Copyright (C) 2015
-
+ 
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
+ 
-    This program is free software; you can redistribute it and/or modify
+ This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+ it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+ the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+ (at your option) any later version.
-
+ 
-    This program is distributed in the hope that it will be useful,
+ This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+ GNU General Public License for more details.
-
+ 
-    You should have received a copy of the GNU General Public License along
+ You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+ with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+ 
-    See the full license in the file "LICENSE" in the top level distribution directory
+ See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+ *************************************************************************************/
-    /*  END LEGAL */
+/*  END LEGAL */
 #ifndef QCD_PHOTON_ACTION_H
 #define QCD_PHOTON_ACTION_H
 namespace Grid{
-  namespace QCD{
+namespace QCD{
  template<class Gimpl>
  class Photon
  {
  public:
    INHERIT_GIMPL_TYPES(Gimpl);
    enum class Gauge    {Feynman, Coulomb, Landau};
    enum class ZmScheme {QedL, QedTL};
  public:
    Photon(Gauge gauge, ZmScheme zmScheme);
    virtual ~Photon(void) = default;
    void FreePropagator(const GaugeField &in, GaugeField &out);
    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
    void StochasticField(GaugeField &out, GridParallelRNG &rng);
  private:
    void invKHatSquared(GaugeLinkField &out);
    void zmSub(GaugeLinkField &out);
  private:
    Gauge    gauge_;
    ZmScheme zmScheme_;
  };
-    template<class Gimpl>
+  template<class Gimpl>
-    class Photon  {
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
-      
+  : gauge_(gauge), zmScheme_(zmScheme)
-    public:
+  {}
-
+  
-      INHERIT_GIMPL_TYPES(Gimpl);
+  template<class Gimpl>
-
+  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
-      enum PhotonType { FEYNMAN_L, FEYNMAN_TL };
+  {
-
+    FFT theFFT(in._grid);
      PhotonType GaugeBC;
      Photon(PhotonType _GaugeBC) : GaugeBC(_GaugeBC){};
      void  FreePropagator (const GaugeField &in,GaugeField &out) {
 	FFT theFFT((GridCartesian *) in._grid);
 	GaugeField in_k(in._grid);
 	GaugeField prop_k(in._grid);
 	theFFT.FFT_all_dim(in_k,in,FFT::forward);
        MomentumSpacePropagator(prop_k,in_k);
 	theFFT.FFT_all_dim(out,prop_k,FFT::backward);
      }
      void  MomentumSpacePropagator(GaugeField &out,const GaugeField &in) {
 	if ( GaugeBC == FEYNMAN_L ) { 
 	  FeynmanGaugeMomentumSpacePropagator_L(out,in);
 	} else if ( GaugeBC == FEYNMAN_TL ) { 
 	  FeynmanGaugeMomentumSpacePropagator_TL(out,in);
 	} else {  // No coulomb yet
 	  assert(0);
 	}
      } 
      void  FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out, const GaugeField &in) { 
 	FeynmanGaugeMomentumSpacePropagator_TL(out,in);
 	GridBase *grid = out._grid;
 	LatticeInteger     coor(grid);
 	GaugeField zz(grid); zz=zero;
 	// xyzt
 	for(int d = 0; d < grid->_ndimension-1;d++){
 	  LatticeCoordinate(coor,d);
 	  out = where(coor==Integer(0),zz,out);
 	}
      }
      void  FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out, const GaugeField &in) { 
 	// what type LatticeComplex 
 	GridBase *grid = out._grid;
 	int nd = grid->_ndimension;
 	typedef typename GaugeField::vector_type vector_type;
 	typedef typename GaugeField::scalar_type ScalComplex;
 	typedef Lattice<iSinglet<vector_type> > LatComplex;
-	std::vector<int> latt_size   = grid->_fdimensions;
+    GaugeField in_k(in._grid);
-
+    GaugeField prop_k(in._grid);
-	LatComplex denom(grid); denom= zero;
+    
-	LatComplex   one(grid); one = ScalComplex(1.0,0.0);
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-	LatComplex   kmu(grid); 
+    MomentumSpacePropagator(prop_k,in_k);
-
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-	ScalComplex ci(0.0,1.0);
+  }
-	// momphase = n * 2pi / L
+  
-	for(int mu=0;mu<Nd;mu++) {
+  template<class Gimpl>
-	  
+  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
-	  LatticeCoordinate(kmu,mu);
+  {
-	  
+    GridBase           *grid = out._grid;
-	  RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    GaugeLinkField     kmu(grid), one(grid);
-
+    const unsigned int nd    = grid->_ndimension;
-	  kmu = TwoPiL * kmu ;
+    std::vector<int>   &l    = grid->_fdimensions;
-	  
+    std::vector<int>   zm(nd,0);
-	  denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
+    TComplexD          Tone = ComplexD(1.0,0.0);
-	}
+    TComplexD          Tzero= ComplexD(0.0,0.0);
-	std::vector<int> zero_mode(nd,0);
+    
-	TComplexD Tone = ComplexD(1.0,0.0);
+    one = ComplexD(1.0,0.0);
-	TComplexD Tzero= ComplexD(0.0,0.0);
+    out = zero;
-
+    for(int mu = 0; mu < nd; mu++)
-	pokeSite(Tone,denom,zero_mode); 
+    {
-
+      RealD twoPiL = M_PI*2./l[mu];
-	denom= one/denom;
+      
-	
+      LatticeCoordinate(kmu,mu);
-	pokeSite(Tzero,denom,zero_mode); 
+      kmu = 2.*sin(.5*twoPiL*kmu);
-
+      out = out + kmu*kmu;
-	out = zero;
+    }
-	out = in*denom;
+    pokeSite(Tone, out, zm);
-      };
+    out = one/out;
-
+    pokeSite(Tzero, out,zm);
-    };
+  }
-
+  
  template<class Gimpl>
  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
  {
    GridBase           *grid = out._grid;
    const unsigned int nd    = grid->_ndimension;
    switch (zmScheme_)
    {
      case ZmScheme::QedTL:
      {
        std::vector<int> zm(nd,0);
        TComplexD        Tzero = ComplexD(0.0,0.0);
        pokeSite(Tzero, out, zm);
        break;
      }
      case ZmScheme::QedL:
      {
        LatticeInteger spNrm(grid), coor(grid);
        GaugeLinkField z(grid);
        spNrm = zero;
        for(int d = 0; d < grid->_ndimension - 1; d++)
        {
          LatticeCoordinate(coor,d);
          spNrm = spNrm + coor*coor;
        }
        out = where(spNrm == 0, 0.*out, out);
        break;
      }
      default:
        break;
    }
  }
  template<class Gimpl>
  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
                                               GaugeField &out)
  {
    GridBase           *grid = out._grid;
    LatticeComplex     k2Inv(grid);
    invKHatSquared(k2Inv);
    zmSub(k2Inv);
    out = in*k2Inv;
  }
  template<class Gimpl>
  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
  {
    auto               *grid = out._grid;
    const unsigned int nd = grid->_ndimension;
    GaugeLinkField     sqrtK2Inv(grid), r(grid);
    GaugeField         aTilde(grid);
    FFT                fft(grid);
    invKHatSquared(sqrtK2Inv);
    sqrtK2Inv = sqrt(real(sqrtK2Inv));
    zmSub(sqrtK2Inv);
    for(int mu = 0; mu < nd; mu++)
    {
      gaussian(rng, r);
      r = sqrtK2Inv*r;
      pokeLorentz(aTilde, r, mu);
    }
    fft.FFT_all_dim(out, aTilde, FFT::backward);
  }
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
 //                                                            const GaugeField &in)
 //  {
 //    
 //    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
 //    
 //    GridBase *grid = out._grid;
 //    LatticeInteger     coor(grid);
 //    GaugeField zz(grid); zz=zero;
 //    
 //    // xyzt
 //    for(int d = 0; d < grid->_ndimension-1;d++){
 //      LatticeCoordinate(coor,d);
 //      out = where(coor==Integer(0),zz,out);
 //    }
 //  }
 //  
 //  template<class Gimpl>
 //  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
 //                                                             const GaugeField &in)
 //  {
 //    
 //    // what type LatticeComplex
 //    GridBase *grid = out._grid;
 //    int nd = grid->_ndimension;
 //    
 //    typedef typename GaugeField::vector_type vector_type;
 //    typedef typename GaugeField::scalar_type ScalComplex;
 //    typedef Lattice<iSinglet<vector_type> > LatComplex;
 //    
 //    std::vector<int> latt_size   = grid->_fdimensions;
 //    
 //    LatComplex denom(grid); denom= zero;
 //    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
 //    LatComplex   kmu(grid);
 //    
 //    ScalComplex ci(0.0,1.0);
 //    // momphase = n * 2pi / L
 //    for(int mu=0;mu<Nd;mu++) {
 //      
 //      LatticeCoordinate(kmu,mu);
 //      
 //      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
 //      
 //      kmu = TwoPiL * kmu ;
 //      
 //      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
 //    }
 //    std::vector<int> zero_mode(nd,0);
 //    TComplexD Tone = ComplexD(1.0,0.0);
 //    TComplexD Tzero= ComplexD(0.0,0.0);
 //    
 //    pokeSite(Tone,denom,zero_mode);
 //    
 //    denom= one/denom;
 //    
 //    pokeSite(Tzero,denom,zero_mode);
 //    
 //    out = zero;
 //    out = in*denom;
 //  };
 }}
 #endif
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -41,6 +41,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid{
 namespace Optimization {
  template<class vtype>
  union uconv {
    __m512 f;
    vtype v;
  };
  union u512f {
    __m512 v;
    float f[8];
  };
  union u512d {
    __m512 v;
    double f[4];
  };
  struct Vsplat{
    //Complex float
@@ -372,8 +388,9 @@ namespace Optimization {
  // Some Template specialization
  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-#undef GNU_CLANG_COMPILER 
+
-#ifdef GNU_CLANG_COMPILER
+#ifndef __INTEL_COMPILER
 #warning "Slow reduction due to incomplete reduce intrinsics"
  //Complex float Reduce
  template<>
    inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -250,8 +250,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  }
 }
-template<class vobj> inline 
+template<class vobj> inline void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 {
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;
@@ -269,8 +268,7 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  }}
 }
-template<class vobj> inline 
+template<class vobj> inline void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 {
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;