diff --git a/Makefile.am b/Makefile.am
index 049220e8..18b3ddc3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,8 +3,8 @@ SUBDIRS = lib benchmarks tests
 
 .PHONY: tests
 
-tests:
-	make -C tests tests
+tests: all
+	$(MAKE) -C tests tests
 
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
diff --git a/README b/README
deleted file mode 100644
index 17e92fa0..00000000
--- a/README
+++ /dev/null
@@ -1,44 +0,0 @@
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
-
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shapped arrays are called conformable arrays.
-
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
-
-The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes.
-
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
-
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
-
-These are presented as 
-
-  vRealF, vRealD, vComplexF, vComplexD 
-
-internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named
-
-  RealF, RealD, ComplexF, ComplexD
-
-MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library.
-
-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-is are examples:
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
-
-     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
-     
-     
diff --git a/README b/README
new file mode 120000
index 00000000..42061c01
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+README.md
\ No newline at end of file
diff --git a/README.md b/README.md
index 49f08237..f4a376f1 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ If you want to build all the tests at once just use `make tests`.
 
 ### Possible communication interfaces
 
-The following options can be use with the `--enable-simd=` option to target different communication interfaces:
+The following options can be use with the `--enable-comms=` option to target different communication interfaces:
 
 | `<comm>`       | Description                                                   |
 | -------------- | ------------------------------------------------------------- |
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index de73bc81..969a2a42 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -193,6 +193,7 @@ int main (int argc, char ** argv)
     }
   }  
 
+
   Nloop=100;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
@@ -271,5 +272,90 @@ int main (int argc, char ** argv)
       std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
     }
   }    
+
+
+
+  Nloop=100;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+      				    lat*mpi_layout[1],
+      				    lat*mpi_layout[2],
+      				    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){
+
+	std::vector<CartesianCommunicator::CommsRequest_t> requests;
+
+	ncomm=0;
+	for(int mu=0;mu<4;mu++){
+	
+	  if (mpi_layout[mu]>1 ) {
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    Grid.StencilSendToRecvFromBegin(requests,
+					    (void *)&xbuf[mu][0],
+					    xmit_to_rank,
+					    (void *)&rbuf[mu][0],
+					    recv_from_rank,
+					    bytes);
+	    //	    Grid.StencilSendToRecvFromComplete(requests);
+	    //	    requests.resize(0);
+
+	    comm_proc = mpi_layout[mu]-1;
+	  
+	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    Grid.StencilSendToRecvFromBegin(requests,
+					    (void *)&xbuf[mu+4][0],
+					    xmit_to_rank,
+					    (void *)&rbuf[mu+4][0],
+					    recv_from_rank,
+					    bytes);
+	    Grid.StencilSendToRecvFromComplete(requests);
+	    requests.resize(0);
+	  
+	  }
+	}
+	Grid.Barrier();
+
+      }
+      double stop=usecond();
+
+      double dbytes    = bytes;
+      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      double time = stop-start; // microseconds
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+    }
+  }    
+
   Grid_finalize();
 }
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 5d42a1f9..10e4521b 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -57,7 +57,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)
 
   int ncall =100;
   if (1) {
-
+    FGrid->Barrier();
     Dw.ZeroCounters();
     double t0=usecond();
     for(int i=0;i<ncall;i++){
@@ -147,6 +147,7 @@ int main (int argc, char ** argv)
       __SSC_STOP;
     }
     double t1=usecond();
+    FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=1344*volume*ncall;
@@ -158,7 +159,7 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result; 
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    assert (norm2(err)< 1.0e-5 );
+    assert (norm2(err)< 1.0e-4 );
     Dw.Report();
   }
 
@@ -193,6 +194,7 @@ int main (int argc, char ** argv)
       pokeSite(tmp,ssrc,site);
     }}}}}
     std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
+    FGrid->Barrier();
     double t0=usecond();
     sDw.ZeroCounters();
     for(int i=0;i<ncall;i++){
@@ -201,6 +203,7 @@ int main (int argc, char ** argv)
       __SSC_STOP;
     }
     double t1=usecond();
+    FGrid->Barrier();
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=1344*volume*ncall;
 
@@ -211,12 +214,12 @@ int main (int argc, char ** argv)
   
     if(0){
       for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-  sDw.Dhop(ssrc,sresult,0);
-  PerformanceCounter Counter(i);
-  Counter.Start();
-  sDw.Dhop(ssrc,sresult,0);
-  Counter.Stop();
-  Counter.Report();
+	sDw.Dhop(ssrc,sresult,0);
+	PerformanceCounter Counter(i);
+	Counter.Start();
+	sDw.Dhop(ssrc,sresult,0);
+	Counter.Stop();
+	Counter.Report();
       }
     }
 
@@ -240,7 +243,7 @@ int main (int argc, char ** argv)
       }
     }}}}}
     std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
-    assert (sum< 1.0e-5 );
+    assert (sum< 1.0e-4 );
 
 
     if (1) {
@@ -271,6 +274,7 @@ int main (int argc, char ** argv)
       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
       std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 
+      FGrid->Barrier();
       sDw.ZeroCounters();
       sDw.stat.init("DhopEO");
       double t0=usecond();
@@ -278,6 +282,7 @@ int main (int argc, char ** argv)
         sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
       }
       double t1=usecond();
+      FGrid->Barrier();
       sDw.stat.print();
 
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
@@ -301,7 +306,7 @@ int main (int argc, char ** argv)
 
       error+= norm2(ssrc_o);
       std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
-      if(error>1.0e-5) { 
+      if(error>1.0e-4) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
 	std::cout<< ssrc << std::endl;
@@ -337,7 +342,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   err = ref-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+  assert(norm2(err)<1.0e-4);
   LatticeFermion src_e (FrbGrid);
   LatticeFermion src_o (FrbGrid);
   LatticeFermion r_e   (FrbGrid);
@@ -363,11 +368,13 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
   {
     Dw.ZeroCounters();
+    FGrid->Barrier();
     double t0=usecond();
     for(int i=0;i<ncall;i++){
       Dw.DhopEO(src_o,r_e,DaggerNo);
     }
     double t1=usecond();
+    FGrid->Barrier();
     
     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
     double flops=(1344.0*volume*ncall)/2;
@@ -389,14 +396,14 @@ int main (int argc, char ** argv)
 
   err = r_eo-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+  assert(norm2(err)<1.0e-4);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-  assert(norm2(src_e)<1.0e-5);
-  assert(norm2(src_o)<1.0e-5);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
 
   Grid_finalize();
 }
diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index a3803b87..21db823d 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -69,8 +69,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 
-  int Lmax=32;
-  int dmin=0;
+  int Lmax=16;
+  int dmin=2;
   if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
   if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
   for (int L=8;L<=Lmax;L*=2){
diff --git a/lib/FFT.h b/lib/FFT.h
index fda43eb8..b5b31d82 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -29,9 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 
-#ifdef HAVE_FFTW	
+#ifdef HAVE_FFTW
+#ifdef USE_MKL
+#include <fftw/fftw3.h>
+#else
 #include <fftw3.h>
 #endif
+#endif
 
 
 namespace Grid {
@@ -122,7 +126,8 @@ namespace Grid {
     
     double Flops(void) {return flops;}
     double MFlops(void) {return flops/usec;}
-    
+    double USec(void)   {return (double)usec;}    
+
     FFT ( GridCartesian * grid ) :
     vgrid(grid),
     Nd(grid->_ndimension),
diff --git a/lib/Init.cc b/lib/Init.cc
index b5ef0a95..d6d6b9f8 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -369,7 +369,7 @@ void Grid_init(int *argc,char ***argv)
   
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) 
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
diff --git a/lib/Log.cc b/lib/Log.cc
index 733bfc58..7521657b 100644
--- a/lib/Log.cc
+++ b/lib/Log.cc
@@ -93,7 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
diff --git a/lib/PerfCount.h b/lib/PerfCount.h
index 9ac58883..5ab07c02 100644
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif
 
 namespace Grid {
 
@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
    return tmp;
 }
 #elif defined __x86_64__
-#include <x86intrin.h>
 inline uint64_t cyclecount(void){ 
   return __rdtsc();
   //  unsigned int dummy;
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index bd90b864..1f9b9b58 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -191,7 +191,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
                   << LinalgTimer.Elapsed();
         std::cout << std::endl;
 
-        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);
+        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
 
        	if (!CGState.do_repro && ReproTest){
        		CGState.do_repro = true;
diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc
index 0f43f1f5..5e91b305 100644
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   source =0;
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 74862400..d7a9edd3 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -10,6 +10,7 @@
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
 }
 
 #if defined(AVX512) 
-    
+#include <simd/Intel512wilson.h>
+
     ///////////////////////////////////////////////////////////
     // If we are AVX512 specialise the single precision routine
     ///////////////////////////////////////////////////////////
-    
-#include <simd/Intel512wilson.h>
+
 #include <simd/Intel512single.h>
     
-static Vector<vComplexF> signs;
-    
-  int setupSigns(void ){
-    Vector<vComplexF> bother(2);
+static Vector<vComplexF> signsF;
+
+  template<typename vtype>    
+  int setupSigns(Vector<vtype>& signs ){
+    Vector<vtype> bother(2);
     signs = bother;
     vrsign(signs[0]);
     visign(signs[1]);
     return 1;
   }
-  static int signInit = setupSigns();
+
+  static int signInitF = setupSigns(signsF);
   
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
@@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexF
+#define signs signsF
   
 #undef KERNEL_DAG
 template<> void 
@@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG
@@ -113,8 +118,71 @@ template<> void
 WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+	
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signsD;
+#define signs signsD
+static int signInitD = setupSigns(signsD);
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexD
+  
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
-#endif
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+#define FX(A) DWFASM_ ## A
+#define MAYBEPERM(A,B) 
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+	
+#undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+
+#endif //AVX512
 
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index 12579d8c..72e13754 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -5,7 +5,9 @@
   const uint64_t plocal =(uint64_t) & in._odata[0];
 
   //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
+  //COMPLEX_TYPE is vComplexF of vComplexD depending 
+  //on the chosen precision
+  COMPLEX_TYPE *isigns = &signs[0];
 
   MASK_REGS;
   int nmax=U._grid->oSites();
diff --git a/lib/qcd/hmc/HmcRunner.h b/lib/qcd/hmc/HmcRunner.h
index a31ba784..53b127cf 100644
--- a/lib/qcd/hmc/HmcRunner.h
+++ b/lib/qcd/hmc/HmcRunner.h
@@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate {
     NoSmearing<Gimpl> SmearingPolicy;
     typedef MinimumNorm2<GaugeField, NoSmearing<Gimpl>, RepresentationsPolicy >
         IntegratorType;  // change here to change the algorithm
-    IntegratorParameters MDpar(20, 1.0);
+    IntegratorParameters MDpar(40, 1.0);
     IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);
 
     // Checkpoint strategy
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index 4f2779a8..93e4c9ad 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -382,7 +382,6 @@ namespace Optimization {
   // Some Template specialization
 
   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-
 #ifndef __INTEL_COMPILER
 #warning "Slow reduction due to incomplete reduce intrinsics"
   //Complex float Reduce
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c98bc2d0..2e7c1f0a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -9,4 +9,4 @@ endif
 include Make.inc
 
 subtests:
-	for d in $(SUBDIRS); do make -C $${d} tests; done
+	for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done
diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc
index 6a2868b0..c6b77a13 100644
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -42,7 +42,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
   static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
     for(int mu=0;mu<Nd;mu++){
 //      ImplComplex cmi(0.0,-1.0);
-      ComplexD cmi(0.0,-1.0);
+      Complex cmi(0.0,-1.0);
       A[mu] = Ta(U[mu]) * cmi;
     }
   }
@@ -52,13 +52,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
       dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
     }
   }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,RealD & alpha,int maxiter,RealD Omega_tol, RealD Phi_tol) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol) {
     GridBase *grid = Umu._grid;
 
-    RealD org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-    RealD org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-    RealD old_trace = org_link_trace;
-    RealD trG;
+    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+    Real old_trace = org_link_trace;
+    Real trG;
 
     std::vector<GaugeMat> U(Nd,grid);
                  GaugeMat dmuAmu(grid);
@@ -71,13 +71,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
       // Monitor progress and convergence test 
       // infrequently to minimise cost overhead
       if ( i %20 == 0 ) { 
-	RealD plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-	RealD link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
 
 	std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
 	
-	RealD Phi  = 1.0 - old_trace / link_trace ;
-	RealD Omega= 1.0 - trG;
+	Real Phi  = 1.0 - old_trace / link_trace ;
+	Real Omega= 1.0 - trG;
 
 
 	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
@@ -91,7 +91,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
       }
     }
   };
-  static RealD SteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
     GridBase *grid = U[0]._grid;
 
     std::vector<GaugeMat> A(Nd,grid);
@@ -101,26 +101,26 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
 
 
-    RealD vol = grid->gSites();
-    RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+    Real vol = grid->gSites();
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
 
     SU<Nc>::GaugeTransform(U,g);
 
     return trG;
   }
 
-  static RealD FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
 
     GridBase *grid = U[0]._grid;
 
-    RealD vol = grid->gSites();
+    Real vol = grid->gSites();
 
     FFT theFFT((GridCartesian *)grid);
 
     LatticeComplex  Fp(grid);
     LatticeComplex  psq(grid); psq=zero;
     LatticeComplex  pmu(grid); 
-    LatticeComplex   one(grid); one = ComplexD(1.0,0.0);
+    LatticeComplex   one(grid); one = Complex(1.0,0.0);
 
     GaugeMat g(grid);
     GaugeMat dmuAmu_p(grid);
@@ -139,13 +139,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     std::vector<int> coor(grid->_ndimension,0);
     for(int mu=0;mu<Nd;mu++) {
 
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
       LatticeCoordinate(pmu,mu);
       pmu = TwoPiL * pmu ;
       psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
     }
 
-    ComplexD psqMax(16.0);
+    Complex psqMax(16.0);
     Fp =  psqMax*one/psq;
 
     static int once;
@@ -160,20 +160,20 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
 
     GaugeMat ciadmam(grid);
-    ComplexD cialpha(0.0,-alpha);
+    Complex cialpha(0.0,-alpha);
     ciadmam = dmuAmu*cialpha;
     SU<Nc>::taExp(ciadmam,g);
 
-    RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
 
     SU<Nc>::GaugeTransform(U,g);
 
     return trG;
   }
 
-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
     GridBase *grid = g._grid;
-    ComplexD cialpha(0.0,-alpha);
+    Complex cialpha(0.0,-alpha);
     GaugeMat ciadmam(grid);
     DmuAmu(A,dmuAmu);
     ciadmam = dmuAmu*cialpha;
@@ -193,11 +193,11 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
     ComplexField  pha(grid);
     GaugeMat      Apha(grid);
 
-    ComplexD ci(0.0,1.0);
+    Complex ci(0.0,1.0);
 
     for(int mu=0;mu<Nd;mu++){
 
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
       LatticeCoordinate(pmu,mu);
       pmu = TwoPiL * pmu ;
       pha = exp(pmu *  (0.5 *ci)); // e(ipmu/2) since Amu(x+mu/2)
@@ -213,14 +213,14 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 
     ComplexField  pmu(grid);
     ComplexField  pha(grid);
-    ComplexD ci(0.0,1.0);
+    Complex ci(0.0,1.0);
     
     // Sign convention for FFTW calls:
     // A(x)= Sum_p e^ipx A(p) / V
     // A(p)= Sum_p e^-ipx A(x)
 
     for(int mu=0;mu<Nd;mu++){
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
       LatticeCoordinate(pmu,mu);
       pmu = TwoPiL * pmu ;
       pha = exp(-pmu *  (0.5 *ci)); // e(+ipmu/2) since Amu(x+mu/2)
@@ -241,7 +241,7 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
 
   std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
   int vol = 1;
@@ -261,25 +261,25 @@ int main (int argc, char ** argv)
   std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge    *" <<std::endl;
   std::cout<< "*****************************************************************" <<std::endl;
 
-  LatticeGaugeFieldD   Umu(&GRID);
-  LatticeGaugeFieldD   Uorg(&GRID);
-  LatticeColourMatrixD   g(&GRID); // Gauge xform
+  LatticeGaugeField   Umu(&GRID);
+  LatticeGaugeField   Uorg(&GRID);
+  LatticeColourMatrix   g(&GRID); // Gauge xform
 
   
   SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
   Uorg=Umu;
 
   SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge
-  RealD plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
   std::cout << " Initial plaquette "<<plaq << std::endl;
 
 
 
-  RealD alpha=0.1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplD>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
+  Real alpha=0.1;
+  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
 
 
-  plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
   std::cout << " Final plaquette "<<plaq << std::endl;
 
   Uorg = Uorg - Umu;
diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc
index 54b1ddb3..4eb4398d 100644
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -93,10 +93,10 @@ int main (int argc, char ** argv)
   C=C-Ctilde;
   std::cout << "diff scalar "<<norm2(C) << std::endl;
 
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 
   SpinMatrixF Sp; 
   Sp = zero; Sp = Sp+cVol;
diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge.cc b/tests/hmc/Test_hmc_WilsonFermionGauge.cc
index 9dcf6343..351d1e68 100644
--- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc
@@ -68,7 +68,7 @@ class HmcRunner : public NerscHmcRunner {
     TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
 
     // Set smearing (true/false), default: false
-    Nf2.is_smeared = true;
+    Nf2.is_smeared = false;
 
     // Collect actions
     ActionLevel<LatticeGaugeField> Level1(1);