Merge branch 'develop' into feature/CG_repro

2026-08-02 00:43:28 +01:00 · 2016-11-09 14:44:46 +00:00
parent 9720c9ba3f 58f4950652
commit 53f240200e
20 changed files with 251 additions and 125 deletions
@@ -3,8 +3,8 @@ SUBDIRS = lib benchmarks tests

 .PHONY: tests

-tests:
-	make -C tests tests
+tests: all
+	$(MAKE) -C tests tests

 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
@@ -1,44 +0,0 @@
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
-
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shapped arrays are called conformable arrays.
-
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
-
-The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes.
-
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
-
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
-
-These are presented as 
-
-  vRealF, vRealD, vComplexF, vComplexD 
-
-internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named
-
-  RealF, RealD, ComplexF, ComplexD
-
-MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library.
-
-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-is are examples:
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
-
-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
-
-     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
-     
-     
@@ -0,0 +1 @@
+README.md
@@ -126,7 +126,7 @@ If you want to build all the tests at once just use `make tests`.

 ### Possible communication interfaces

-The following options can be use with the `--enable-simd=` option to target different communication interfaces:
+The following options can be used with the `--enable-comms=` option to target different communication interfaces:

 | `<comm>`       | Description                                                   |
 | -------------- | ------------------------------------------------------------- |
@@ -193,6 +193,7 @@ int main (int argc, char ** argv)
    }
  }  

+
  Nloop=100;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
@@ -271,5 +272,90 @@ int main (int argc, char ** argv)
      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    
+
+
+
+  Nloop=100;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+      				    lat*mpi_layout[1],
+      				    lat*mpi_layout[2],
+      				    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+      double start=usecond();
+      for(int i=0;i<Nloop;i++){
+
+	std::vector<CartesianCommunicator::CommsRequest_t> requests;
+
+	ncomm=0;
+	for(int mu=0;mu<4;mu++){
+	
+	  if (mpi_layout[mu]>1 ) {
+	  
+	    ncomm++;
+	    int comm_proc=1;
+	    int xmit_to_rank;
+	    int recv_from_rank;
+	    
+	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    Grid.StencilSendToRecvFromBegin(requests,
+					    (void *)&xbuf[mu][0],
+					    xmit_to_rank,
+					    (void *)&rbuf[mu][0],
+					    recv_from_rank,
+					    bytes);
+	    //	    Grid.StencilSendToRecvFromComplete(requests);
+	    //	    requests.resize(0);
+
+	    comm_proc = mpi_layout[mu]-1;
+	  
+	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    Grid.StencilSendToRecvFromBegin(requests,
+					    (void *)&xbuf[mu+4][0],
+					    xmit_to_rank,
+					    (void *)&rbuf[mu+4][0],
+					    recv_from_rank,
+					    bytes);
+	    Grid.StencilSendToRecvFromComplete(requests);
+	    requests.resize(0);
+	  
+	  }
+	}
+	Grid.Barrier();
+
+      }
+      double stop=usecond();
+
+      double dbytes    = bytes;
+      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double rbytes    = xbytes;
+      double bidibytes = xbytes+rbytes;
+
+      double time = stop-start; // microseconds
+
+      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+    }
+  }    
+
  Grid_finalize();
 }
@@ -57,7 +57,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)

  int ncall =100;
  if (1) {
-
+    FGrid->Barrier();
    Dw.ZeroCounters();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
@@ -147,6 +147,7 @@ int main (int argc, char ** argv)
      __SSC_STOP;
    }
    double t1=usecond();
+    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
@@ -158,7 +159,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    assert (norm2(err)< 1.0e-5 );
+    assert (norm2(err)< 1.0e-4 );
    Dw.Report();
  }

@@ -193,6 +194,7 @@ int main (int argc, char ** argv)
      pokeSite(tmp,ssrc,site);
    }}}}}
    std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
+    FGrid->Barrier();
    double t0=usecond();
    sDw.ZeroCounters();
    for(int i=0;i<ncall;i++){
@@ -201,6 +203,7 @@ int main (int argc, char ** argv)
      __SSC_STOP;
    }
    double t1=usecond();
+    FGrid->Barrier();
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;

@@ -211,12 +214,12 @@ int main (int argc, char ** argv)
  
    if(0){
      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-  sDw.Dhop(ssrc,sresult,0);
-  PerformanceCounter Counter(i);
-  Counter.Start();
-  sDw.Dhop(ssrc,sresult,0);
-  Counter.Stop();
-  Counter.Report();
+	sDw.Dhop(ssrc,sresult,0);
+	PerformanceCounter Counter(i);
+	Counter.Start();
+	sDw.Dhop(ssrc,sresult,0);
+	Counter.Stop();
+	Counter.Report();
      }
    }

@@ -240,7 +243,7 @@ int main (int argc, char ** argv)
      }
    }}}}}
    std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
-    assert (sum< 1.0e-5 );
+    assert (sum< 1.0e-4 );


    if (1) {
@@ -271,6 +274,7 @@ int main (int argc, char ** argv)
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

+      FGrid->Barrier();
      sDw.ZeroCounters();
      sDw.stat.init("DhopEO");
      double t0=usecond();
@@ -278,6 +282,7 @@ int main (int argc, char ** argv)
        sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
      }
      double t1=usecond();
+      FGrid->Barrier();
      sDw.stat.print();

      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
@@ -301,7 +306,7 @@ int main (int argc, char ** argv)

      error+= norm2(ssrc_o);
      std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
-      if(error>1.0e-5) { 
+      if(error>1.0e-4) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
 	std::cout<< ssrc << std::endl;
@@ -337,7 +342,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
  err = ref-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+  assert(norm2(err)<1.0e-4);
  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
@@ -363,11 +368,13 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
    Dw.ZeroCounters();
+    FGrid->Barrier();
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();
+    FGrid->Barrier();
    
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=(1344.0*volume*ncall)/2;
@@ -389,14 +396,14 @@ int main (int argc, char ** argv)

  err = r_eo-result; 
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  assert(norm2(err)<1.0e-5);
+  assert(norm2(err)<1.0e-4);

  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-  assert(norm2(src_e)<1.0e-5);
-  assert(norm2(src_o)<1.0e-5);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);

  Grid_finalize();
 }
@@ -69,8 +69,8 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

-  int Lmax=32;
-  int dmin=0;
+  int Lmax=16;
+  int dmin=2;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  for (int L=8;L<=Lmax;L*=2){
@@ -29,9 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_

-#ifdef HAVE_FFTW	
+#ifdef HAVE_FFTW
+#ifdef USE_MKL
+#include <fftw/fftw3.h>
+#else
 #include <fftw3.h>
 #endif
+#endif


 namespace Grid {
@@ -122,7 +126,8 @@ namespace Grid {
    
    double Flops(void) {return flops;}
    double MFlops(void) {return flops/usec;}
-    
+    double USec(void)   {return (double)usec;}    
+
    FFT ( GridCartesian * grid ) :
    vgrid(grid),
    Nd(grid->_ndimension),
@@ -369,7 +369,7 @@ void Grid_init(int *argc,char ***argv)
  
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) 
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
@@ -93,7 +93,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
  int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif

 namespace Grid {

@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
-#include <x86intrin.h>
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
  //  unsigned int dummy;
@@ -191,7 +191,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
                  << LinalgTimer.Elapsed();
        std::cout << std::endl;

-        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);
+        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

       	if (!CGState.do_repro && ReproTest){
       		CGState.do_repro = true;
@@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  source =0;
@@ -10,6 +10,7 @@

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
 }

 #if defined(AVX512) 
-    
+#include <simd/Intel512wilson.h>
+
    ///////////////////////////////////////////////////////////
    // If we are AVX512 specialise the single precision routine
    ///////////////////////////////////////////////////////////
-    
-#include <simd/Intel512wilson.h>
+
 #include <simd/Intel512single.h>
    
-static Vector<vComplexF> signs;
-    
-  int setupSigns(void ){
-    Vector<vComplexF> bother(2);
+static Vector<vComplexF> signsF;
+
+  template<typename vtype>    
+  int setupSigns(Vector<vtype>& signs ){
+    Vector<vtype> bother(2);
    signs = bother;
    vrsign(signs[0]);
    visign(signs[1]);
    return 1;
  }
-  static int signInit = setupSigns();
+
+  static int signInitF = setupSigns(signsF);
  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
@@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexF
+#define signs signsF
  
 #undef KERNEL_DAG
 template<> void 
@@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG
@@ -113,8 +118,71 @@ template<> void
 WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+	
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signsD;
+#define signs signsD
+static int signInitD = setupSigns(signsD);
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexD
+  
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
-#endif
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+#define FX(A) DWFASM_ ## A
+#define MAYBEPERM(A,B) 
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+	
+#undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+
+#endif //AVX512

 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
@@ -5,7 +5,9 @@
  const uint64_t plocal =(uint64_t) & in._odata[0];

  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
+  //COMPLEX_TYPE is vComplexF or vComplexD depending 
+  //on the chosen precision
+  COMPLEX_TYPE *isigns = &signs[0];

  MASK_REGS;
  int nmax=U._grid->oSites();
@@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate {
    NoSmearing<Gimpl> SmearingPolicy;
    typedef MinimumNorm2<GaugeField, NoSmearing<Gimpl>, RepresentationsPolicy >
        IntegratorType;  // change here to change the algorithm
-    IntegratorParameters MDpar(20, 1.0);
+    IntegratorParameters MDpar(40, 1.0);
    IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy);

    // Checkpoint strategy
@@ -382,7 +382,6 @@ namespace Optimization {
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-
 #ifndef __INTEL_COMPILER
 #warning "Slow reduction due to incomplete reduce intrinsics"
  //Complex float Reduce
@@ -9,4 +9,4 @@ endif
 include Make.inc

 subtests:
-	for d in $(SUBDIRS); do make -C $${d} tests; done
+	for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done
@@ -42,7 +42,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
    for(int mu=0;mu<Nd;mu++){
 //      ImplComplex cmi(0.0,-1.0);
-      ComplexD cmi(0.0,-1.0);
+      Complex cmi(0.0,-1.0);
      A[mu] = Ta(U[mu]) * cmi;
    }
  }
@@ -52,13 +52,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
      dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
    }
  }  
-  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,RealD & alpha,int maxiter,RealD Omega_tol, RealD Phi_tol) {
+  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol) {
    GridBase *grid = Umu._grid;

-    RealD org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-    RealD org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
-    RealD old_trace = org_link_trace;
-    RealD trG;
+    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+    Real old_trace = org_link_trace;
+    Real trG;

    std::vector<GaugeMat> U(Nd,grid);
                 GaugeMat dmuAmu(grid);
@@ -71,13 +71,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
      // Monitor progress and convergence test 
      // infrequently to minimise cost overhead
      if ( i %20 == 0 ) { 
-	RealD plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
-	RealD link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
+	Real plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
+	Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 

 	std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
 	
-	RealD Phi  = 1.0 - old_trace / link_trace ;
-	RealD Omega= 1.0 - trG;
+	Real Phi  = 1.0 - old_trace / link_trace ;
+	Real Omega= 1.0 - trG;


 	std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
@@ -91,7 +91,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
      }
    }
  };
-  static RealD SteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) {
+  static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
    GridBase *grid = U[0]._grid;

    std::vector<GaugeMat> A(Nd,grid);
@@ -101,26 +101,26 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);


-    RealD vol = grid->gSites();
-    RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+    Real vol = grid->gSites();
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

    SU<Nc>::GaugeTransform(U,g);

    return trG;
  }

-  static RealD FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,RealD & alpha, GaugeMat & dmuAmu) {
+  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {

    GridBase *grid = U[0]._grid;

-    RealD vol = grid->gSites();
+    Real vol = grid->gSites();

    FFT theFFT((GridCartesian *)grid);

    LatticeComplex  Fp(grid);
    LatticeComplex  psq(grid); psq=zero;
    LatticeComplex  pmu(grid); 
-    LatticeComplex   one(grid); one = ComplexD(1.0,0.0);
+    LatticeComplex   one(grid); one = Complex(1.0,0.0);

    GaugeMat g(grid);
    GaugeMat dmuAmu_p(grid);
@@ -139,13 +139,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    std::vector<int> coor(grid->_ndimension,0);
    for(int mu=0;mu<Nd;mu++) {

-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
      LatticeCoordinate(pmu,mu);
      pmu = TwoPiL * pmu ;
      psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5); 
    }

-    ComplexD psqMax(16.0);
+    Complex psqMax(16.0);
    Fp =  psqMax*one/psq;

    static int once;
@@ -160,20 +160,20 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);

    GaugeMat ciadmam(grid);
-    ComplexD cialpha(0.0,-alpha);
+    Complex cialpha(0.0,-alpha);
    ciadmam = dmuAmu*cialpha;
    SU<Nc>::taExp(ciadmam,g);

-    RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
+    Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;

    SU<Nc>::GaugeTransform(U,g);

    return trG;
  }

-  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) {
+  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
    GridBase *grid = g._grid;
-    ComplexD cialpha(0.0,-alpha);
+    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
    DmuAmu(A,dmuAmu);
    ciadmam = dmuAmu*cialpha;
@@ -193,11 +193,11 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    ComplexField  pha(grid);
    GaugeMat      Apha(grid);

-    ComplexD ci(0.0,1.0);
+    Complex ci(0.0,1.0);

    for(int mu=0;mu<Nd;mu++){

-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
      LatticeCoordinate(pmu,mu);
      pmu = TwoPiL * pmu ;
      pha = exp(pmu *  (0.5 *ci)); // e(ipmu/2) since Amu(x+mu/2)
@@ -213,14 +213,14 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {

    ComplexField  pmu(grid);
    ComplexField  pha(grid);
-    ComplexD ci(0.0,1.0);
+    Complex ci(0.0,1.0);
    
    // Sign convention for FFTW calls:
    // A(x)= Sum_p e^ipx A(p) / V
    // A(p)= Sum_p e^-ipx A(x)

    for(int mu=0;mu<Nd;mu++){
-      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+      Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
      LatticeCoordinate(pmu,mu);
      pmu = TwoPiL * pmu ;
      pha = exp(-pmu *  (0.5 *ci)); // e(+ipmu/2) since Amu(x+mu/2)
@@ -241,7 +241,7 @@ int main (int argc, char ** argv)
  int threads = GridThread::GetThreads();

  std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int vol = 1;
@@ -261,25 +261,25 @@ int main (int argc, char ** argv)
  std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge    *" <<std::endl;
  std::cout<< "*****************************************************************" <<std::endl;

-  LatticeGaugeFieldD   Umu(&GRID);
-  LatticeGaugeFieldD   Uorg(&GRID);
-  LatticeColourMatrixD   g(&GRID); // Gauge xform
+  LatticeGaugeField   Umu(&GRID);
+  LatticeGaugeField   Uorg(&GRID);
+  LatticeColourMatrix   g(&GRID); // Gauge xform

  
  SU3::ColdConfiguration(pRNG,Umu); // Unit gauge
  Uorg=Umu;

  SU3::RandomGaugeTransform(pRNG,Umu,g); // Unit gauge
-  RealD plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Initial plaquette "<<plaq << std::endl;



-  RealD alpha=0.1;
-  FourierAcceleratedGaugeFixer<PeriodicGimplD>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);
+  Real alpha=0.1;
+  FourierAcceleratedGaugeFixer<PeriodicGimplR>::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10);


-  plaq=WilsonLoops<PeriodicGimplD>::avgPlaquette(Umu);
+  plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
  std::cout << " Final plaquette "<<plaq << std::endl;

  Uorg = Uorg - Umu;
@@ -93,10 +93,10 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;

-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;

  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;
@@ -68,7 +68,7 @@ class HmcRunner : public NerscHmcRunner {
    TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);

    // Set smearing (true/false), default: false
-    Nf2.is_smeared = true;
+    Nf2.is_smeared = false;

    // Collect actions
    ActionLevel<LatticeGaugeField> Level1(1);