mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Merge branch 'master' of github.com:paboyle/Grid

Conflicts:
	lib/qcd/action/fermion/WilsonFermion5D.cc
paboyle 2015-11-06 03:50:04 -08:00
commit 899ca41cb8
20 changed files with 559 additions and 8450 deletions

View File

@ -93,12 +93,13 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
}
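
Since t1-t0 above is measured in microseconds, flops/(t1-t0) is already in Mflop/s (the factor of 10^6 cancels), with 1344 floating point operations assumed per site per Dw application. A minimal standalone sketch of the same arithmetic, with illustrative (assumed) lattice size, Ls, call count and timing:

#include <cstdio>

int main(void) {
  const double Ls       = 16;                     // assumed extent of the fifth dimension
  const int    latt4[4] = {16, 16, 16, 16};       // assumed 4d lattice
  const double ncall    = 1000;                   // assumed number of Dw applications
  const double t_us     = 2.0e6;                  // assumed elapsed time in microseconds

  double volume = Ls;
  for (int mu = 0; mu < 4; mu++) volume *= latt4[mu];

  double flops = 1344.0 * volume * ncall;         // 1344 flops per site per call
  std::printf("mflop/s = %f\n", flops / t_us);    // flops per microsecond == Mflop/s
  return 0;
}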

configure (vendored): file diff suppressed because it is too large (8040 lines changed).

View File

@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

View File

@ -16,8 +16,9 @@
#include <iterator>
#define __X86_64
#define EXECINFO
#ifdef EXECINFO
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
@ -233,7 +234,9 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
printf(" mem address %llx\n",(unsigned long long)si->si_addr);
printf(" code %d\n",si->si_code);
#ifdef __X86_64
// Linux/Posix
#ifdef __linux__
// And x86 64bit
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(unsigned long long)sc->rip);
@ -259,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
REG(r14);
REG(r15);
#endif
#ifdef EXECINFO
#ifdef HAVE_EXECINFO_H
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
for (int i = 0; i < symbols; i++){
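
For reference, a minimal, self-contained sketch of the HAVE_EXECINFO_H-guarded backtrace pattern used above; the helper name, buffer depth and the standalone main are assumptions for illustration:

#include <cstdio>
#include <cstdlib>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif

static void print_backtrace(void) {
#ifdef HAVE_EXECINFO_H
  void *buffer[32];                                     // assumed backtrace depth
  int symbols    = backtrace(buffer, 32);               // collect return addresses
  char **strings = backtrace_symbols(buffer, symbols);  // translate to printable strings
  for (int i = 0; i < symbols; i++) std::printf("%s\n", strings[i]);
  std::free(strings);                                   // backtrace_symbols allocates with malloc
#endif
}

int main(void) { print_backtrace(); return 0; }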

View File

@ -48,10 +48,14 @@ namespace Grid {
int _around_the_world;
};
template<class vobj,class cobj, class compressor>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
typedef uint32_t StencilInteger;
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int _checkerboard;
int _npoints; // Move to template param?
@ -66,33 +70,334 @@ namespace Grid {
// npoints x Osites() of these
std::vector<std::vector<StencilEntry> > _entries;
// Comms buffers
std::vector<std::vector<scalar_object> > send_buf_extract;
std::vector<std::vector<scalar_object> > recv_buf_extract;
std::vector<scalar_object *> pointers;
std::vector<scalar_object *> rpointers;
Vector<cobj> send_buf;
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
int _unified_buffer_size;
int _request_count;
double buftime;
double gathertime;
double commtime;
double commstime;
double halotime;
double scattertime;
double mergetime;
double gathermtime;
double splicetime;
double nosplicetime;
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances);
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
gathertime=0;
commtime=0;
commstime=0;
halotime=0;
scattertime=0;
mergetime=0;
gathermtime=0;
buftime=0;
splicetime=0;
nosplicetime=0;
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in the local lattice or in a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
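// Illustrative usage (a hedged sketch; the concrete values are assumptions, not taken from this file):
// a 4d nearest-neighbour stencil would have npoints=8, one (direction,displacement) pair per
// forward and backward hop, e.g.
//   std::vector<int> directions   ({0,1,2,3, 0, 1, 2, 3});
//   std::vector<int> displacements({1,1,1,1,-1,-1,-1,-1});
// and the Wilson implementations later in this diff instantiate the template as
// CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor>. The cbmask argument passed to
// Local/Comms selects the checkerboard of a pass: 0x1 even sites, 0x2 odd sites, 0x3 both.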
void Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
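// Worked example of the permute_slice logic above (illustrative numbers, assumed for clarity):
// with rd=4 and sshift=1, wrap=sshift/rd=0 and num=sshift%rd=1, so slices x=0,1,2 (x<rd-num)
// get permute_slice=wrap=0, while the final slice x=3 gets permute_slice=1-wrap=1; only the
// slice that wraps around the reduced dimension needs a SIMD-lane permute.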
void Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// sent to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// CartesianStencil(GridBase *grid,
// int npoints,
// int checkerboard,
// const std::vector<int> &directions,
// const std::vector<int> &distances);
// Add to tables for various cases; is this mistaken? Only local if there is 1 proc in the dim.
// Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// void Local (int point, int dimension,int shift,int cbmask);
// void Comms (int point, int dimension,int shift,int cbmask);
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
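// A hedged sketch of the compressor concept assumed here: a functor whose operator() maps the
// stored site object to the (possibly smaller) cobj that is actually communicated, matching the
// call sites of the form compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid). An
// illustrative identity compressor (the name is hypothetical, not part of this code base):
//   template<class vobj> class PassThroughCompressor {
//   public:
//     vobj operator()(const vobj &arg,int dimension,int plane,int osite,GridBase *grid){ return arg; }
//   };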
template<class vobj,class cobj, class compressor> void
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
halotime-=usecond();
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0;
@ -124,27 +429,35 @@ namespace Grid {
if ( comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
splicetime+=usecond();
} else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
}
} else {
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if(splice_dim){
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
splicetime+=usecond();
} else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
}
}
}
}
halotime+=usecond();
}
template<class vobj,class cobj, class compressor>
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress)
@ -168,8 +481,7 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -186,7 +498,9 @@ namespace Grid {
int bytes = words * sizeof(cobj);
gathertime-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
gathertime+=usecond();
int rank = _grid->_processor;
int recv_from_rank;
@ -196,32 +510,27 @@ namespace Grid {
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
commtime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
(void *)&u_comm_buf[u_comm_offset],
recv_from_rank,
bytes);
commtime+=usecond();
for(int i=0;i<words;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
// std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<" mask 0x"<<cbmask<<std::endl;
}
u_comm_offset+=words;
}
}
}
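// The exchange above is assumed to behave like a blocking pairwise send/receive of `bytes`
// bytes; in MPI terms (a sketch only, not the Grid communicator API itself) it corresponds to:
//   MPI_Sendrecv(&send_buf[0],               bytes, MPI_CHAR, xmit_to_rank,   0,
//                &u_comm_buf[u_comm_offset], bytes, MPI_CHAR, recv_from_rank, 0,
//                communicator, MPI_STATUS_IGNORE);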
template<class vobj,class cobj, class compressor>
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress)
{
buftime-=usecond();
const int Nsimd = _grid->Nsimd();
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -244,17 +553,22 @@ namespace Grid {
int words = sizeof(cobj)/sizeof(vector_type);
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
/*
* possibly slow to allocate
* Doesn't matter in this test, but may want to preallocate in the
* dirac operators
*/
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
// Should grow to max size and then cost very little thereafter
send_buf_extract.resize(Nsimd);
recv_buf_extract.resize(Nsimd);
for(int l=0;l<Nsimd;l++){
if( send_buf_extract[l].size() < buffer_size) {
send_buf_extract[l].resize(buffer_size);
recv_buf_extract[l].resize(buffer_size);
}
}
pointers.resize(Nsimd);
rpointers.resize(Nsimd);
int bytes = buffer_size*sizeof(scalar_object);
buftime+=usecond();
///////////////////////////////////////////
// Work out what to send where
@ -275,7 +589,9 @@ namespace Grid {
}
int sx = (x+sshift)%rd;
gathermtime-=usecond();
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
gathermtime+=usecond();
for(int i=0;i<Nsimd;i++){
@ -302,11 +618,13 @@ namespace Grid {
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
commstime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
commstime+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
@ -316,11 +634,13 @@ namespace Grid {
}
// Here we don't want to scatter, just place into a buffer.
mergetime-=usecond();
PARALLEL_FOR_LOOP
for(int i=0;i<buffer_size;i++){
assert(u_comm_offset+i<_unified_buffer_size);
// assert(u_comm_offset+i<_unified_buffer_size);
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
}
mergetime+=usecond();
u_comm_offset+=buffer_size;
}
}

View File

@ -170,7 +170,7 @@ namespace Grid {
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil Stencil;
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
std::vector<CoarseMatrix> A;

View File

@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP21
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
// int bo = n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
}
}
}
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
//PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb & cbmask ) {
cobj temp;
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
}
}
}
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
// int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
if ( cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
}
}
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
}
}
} else {
assert(0); // think this is buggy FIXME
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
}
}
}
}
}
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
int o =n*rhs._grid->_slice_stride[dimension]+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
} else {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
}
}

View File

@ -26,7 +26,7 @@ namespace Grid {
// and Methods:
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
//
@ -101,6 +101,7 @@ namespace Grid {
typedef typename Impl::SiteSpinor SiteSpinor; \
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams;
///////
@ -112,7 +113,6 @@ namespace Grid {
typedef ImplGauge<S,Nrepresentation> Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
@ -128,10 +128,11 @@ namespace Grid {
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
ImplParams Params;
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
mult(&phi(),&U(mu),&chi());
}
@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
typedef GparityWilsonImplParams ImplParams;
ImplParams Params;
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;

View File

@ -109,7 +109,7 @@ namespace QCD {
///////////////////////////////////
template<class Impl>
void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
@ -123,7 +123,7 @@ namespace QCD {
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
st.HaloExchange(B,comm_buf,compressor);
for(int mu=0;mu<Nd;mu++){
@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
Stencil.HaloExchange(in,comm_buf,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
template<class Impl>
void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
st.HaloExchange(in,comm_buf,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {

View File

@ -73,14 +73,14 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st,
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
@ -108,9 +108,9 @@ namespace Grid {
GridBase * _cbgrid;
//Defines the stencils for even and odd
CartesianStencil Stencil;
CartesianStencil StencilEven;
CartesianStencil StencilOdd;
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -70,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
commtime=0;
dslashtime=0;
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -87,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
Stencil.HaloExchange(in,comm_buf,compressor);
int skip = (disp==1) ? 0 : 1;
@ -107,7 +109,7 @@ PARALLEL_FOR_LOOP
};
template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
@ -124,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
st.HaloExchange(B,comm_buf,compressor);
Atilde=A;
@ -196,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
@ -214,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
@ -229,13 +252,16 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
commtime -=usecond();
st.HaloExchange(in,comm_buf,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for the fermion
// Note loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 linear-access unit-stride streams per thread for the Fermion, suitable for hardware prefetch.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static)
@ -349,6 +375,7 @@ PARALLEL_FOR_LOOP
}
}
}
dslashtime +=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)

View File

@ -32,7 +32,8 @@ namespace Grid {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
double commtime;
double dslashtime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
@ -73,14 +74,14 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st,
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(CartesianStencil & st,
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
@ -98,6 +99,7 @@ namespace Grid {
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
void Report(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
@ -113,9 +115,9 @@ namespace Grid {
int Ls;
//Defines the stencils for even and odd
CartesianStencil Stencil;
CartesianStencil StencilEven;
CartesianStencil StencilOdd;
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -3,7 +3,7 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
};
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
{

View File

@ -17,15 +17,15 @@ namespace Grid {
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
#if defined(AVX512) || defined(IMCI)
@ -41,23 +41,23 @@ namespace Grid {
#endif
#define HANDOPT
#ifdef HANDOPT
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
#else
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{

View File

@ -75,7 +75,7 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
{

View File

@ -282,7 +282,7 @@ namespace QCD {
#ifdef HANDOPT
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{

View File

@ -204,7 +204,7 @@ namespace Optimization {
#if defined (AVX2)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
@ -248,8 +248,8 @@ namespace Optimization {
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2)
__m256d a_real = _mm256_moveldup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_movehdup_pd( a ); // Ai Ai
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
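
The intrinsic sequences above implement the standard complex product (Ar*Br - Ai*Bi, Ar*Bi + Ai*Br); a scalar reference for cross-checking the sign pattern (a hedged sketch with illustrative operands, not Grid code):

#include <complex>
#include <cstdio>

int main(void) {
  std::complex<double> a(1.0, 2.0), b(3.0, 4.0);      // illustrative operands
  double re = a.real()*b.real() - a.imag()*b.imag();  // Ar*Br - Ai*Bi
  double im = a.real()*b.imag() + a.imag()*b.real();  // Ar*Bi + Ai*Br
  std::printf("manual (%f,%f) vs std::complex (%f,%f)\n", re, im, (a*b).real(), (a*b).imag());
  return 0;
}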

View File

@ -1,275 +1,6 @@
#include "Grid.h"
namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
void CartesianStencil::Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
void CartesianStencil::Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
}

View File

@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
template<class vobj> inline
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
{
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int words=sizeof(vobj)/sizeof(vector_type);
const int Nsimd=vobj::vector_type::Nsimd();
int Nextr=extracted.size();
int s = Nsimd/Nextr;
scalar_type * vp = (scalar_type *)&vec;
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nextr;i++) {
pointers[i] =(scalar_type *)& extracted[i][offset];
}
vector_type *vp = (vector_type *)&vec;
for(int w=0;w<words;w++){
extract<vector_type,scalar_type>(&vp[w],pointers,w);
for(int i=0;i<Nextr;i++){
scalar_type * pointer = (scalar_type *)& extracted[i][offset];
pointer[w] = vp[i*s+w*Nsimd];
}
}
}
@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr=extracted.size();
int s=Nsimd/Nextr;
std::vector<scalar_type *> pointers(Nextr);
for(int i=0;i<Nextr;i++)
pointers[i] =(scalar_type *)& extracted[i][offset];
vector_type *vp = (vector_type *)&vec;
scalar_type *pointer;
scalar_type *vp = (scalar_type *)&vec;
for(int w=0;w<words;w++){
merge<vector_type,scalar_type>(&vp[w],pointers,w);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
pointer=(scalar_type *)&extracted[i][offset];
vp[w*Nsimd+i*s+ii] = pointer[w];
}
}
}
}
}
}
#endif
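
The extract/merge pair above shuffles scalars between one packed SIMD object (words vector words, each of Nsimd lanes, so lane l of word w sits at flat scalar index w*Nsimd+l) and per-lane scalar buffers. A type-agnostic analogue of that indexing, with assumed Nsimd and words values and no Grid types involved:

#include <array>
#include <cstdio>
#include <vector>

int main(void) {
  constexpr int Nsimd = 4, words = 2;             // assumed lane count and vector words per object
  std::array<double, words * Nsimd> vec{};        // stands in for the packed vobj
  for (int i = 0; i < words * Nsimd; i++) vec[i] = i;

  // "extract": one scalar buffer per SIMD lane; word w of lane l lives at vec[w*Nsimd + l]
  std::vector<std::array<double, words> > lanes(Nsimd);
  for (int l = 0; l < Nsimd; l++)
    for (int w = 0; w < words; w++) lanes[l][w] = vec[w * Nsimd + l];

  // "merge": rebuild the packed object from the per-lane buffers
  std::array<double, words * Nsimd> back{};
  for (int l = 0; l < Nsimd; l++)
    for (int w = 0; w < words; w++) back[w * Nsimd + l] = lanes[l][w];

  std::printf("round trip ok: %d\n", int(vec == back));
  return 0;
}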

View File

@ -1,5 +1,5 @@
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
Test_GaugeAction_SOURCES=Test_GaugeAction.cc