Tidy up mpi3; also clean up the dslash controls.

2026-08-01 16:33:28 +01:00 · 2016-11-02 08:07:09 +00:00
parent 791cb050c8
commit bb94ddd0eb
13 changed files with 321 additions and 632 deletions
@@ -48,8 +48,8 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-
-  for(int lat=4;lat<=32;lat+=2){
+  int maxlat=32;
+  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],
@@ -124,7 +124,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;


-  for(int lat=4;lat<=32;lat+=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat,lat,lat,lat});
@@ -199,7 +199,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

-  for(int lat=4;lat<=32;lat+=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],
@@ -271,131 +271,5 @@ int main (int argc, char ** argv)
      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    
-
-
-
-#if 0
-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-
-
-  for(int lat=4;lat<=32;lat+=2){
-    for(int Ls=1;Ls<=16;Ls*=2){
-
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-
-
-      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-
-
-      std::vector<CartesianCommunicator::CommsRequest_t> empty;
-      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
-      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
-
-      for(int mu=0;mu<4;mu++){
-	ncomm=0;
-	if (mpi_layout[mu]>1 ) {
-	  ncomm++;
-
-	  int comm_proc;
-	  int xmit_to_rank;
-	  int recv_from_rank;
-
-	  comm_proc=1;
-	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	  Grid.SendToRecvFromInit(requests_fwd[mu],
-				  (void *)&xbuf[mu][0],
-				  xmit_to_rank,
-				  (void *)&rbuf[mu][0],
-				  recv_from_rank,
-				  bytes);
-
-	  comm_proc = mpi_layout[mu]-1;
-	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	  Grid.SendToRecvFromInit(requests_bwd[mu],
-				  (void *)&xbuf[mu+4][0],
-				  xmit_to_rank,
-				  (void *)&rbuf[mu+4][0],
-				  recv_from_rank,
-				  bytes);
-
-	}
-      }
-
-      {
-	double start=usecond();
-	for(int i=0;i<Nloop;i++){
-	  
-	  for(int mu=0;mu<4;mu++){
-	    
-	    if (mpi_layout[mu]>1 ) {
-	      
-	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
-	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
-	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
-	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
-	    }
-	  }
-	  Grid.Barrier();
-	}
-	
-	double stop=usecond();
-	
-	double dbytes    = bytes;
-	double xbytes    = Nloop*dbytes*2.0*ncomm;
-	double rbytes    = xbytes;
-	double bidibytes = xbytes+rbytes;
-	
-	double time = stop-start;
-	
-	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
-
-      }
-
-
-      {
-	double start=usecond();
-	for(int i=0;i<Nloop;i++){
-	  
-	  for(int mu=0;mu<4;mu++){
-	    
-	    if (mpi_layout[mu]>1 ) {
-	      
-	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
-	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
-	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
-	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
-	    }
-	  }
-	  Grid.Barrier();
-	}
-	
-	double stop=usecond();
-	
-	double dbytes    = bytes;
-	double xbytes    = Nloop*dbytes*2.0*ncomm;
-	double rbytes    = xbytes;
-	double bidibytes = xbytes+rbytes;
-	
-	double time = stop-start;
-	
-	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
-
-      }
-
-    }
-  }
-
-#endif
-
  Grid_finalize();
 }
@@ -44,7 +44,6 @@ struct scal {
    Gamma::GammaT
  };

-bool overlapComms = false;
 typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
@@ -54,10 +53,6 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
-    overlapComms = true;
-  }
-
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

@@ -126,14 +121,21 @@ int main (int argc, char ** argv)

  RealD NP = UGrid->_Nprocessors;

-  for(int doasm=1;doasm<2;doasm++){
-
-    QCD::WilsonKernelsStatic::AsmOpt=doasm;
-
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  
-  std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
-  std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
  int ncall =100;
  if (1) {

@@ -162,6 +164,17 @@ int main (int argc, char ** argv)

  if (1)
  {
+
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+    std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
+    std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+    if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+    if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+
    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
@@ -248,6 +261,16 @@ int main (int argc, char ** argv)
      sr_e = zero;
      sr_o = zero;

+      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
+      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+
      sDw.ZeroCounters();
      sDw.stat.init("DhopEO");
      double t0=usecond();
@@ -308,7 +331,7 @@ int main (int argc, char ** argv)
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
-  std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
@@ -322,13 +345,22 @@ int main (int argc, char ** argv)
  LatticeFermion r_eo  (FGrid);


-  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
    Dw.ZeroCounters();
    double t0=usecond();
@@ -366,8 +398,5 @@ int main (int argc, char ** argv)
  assert(norm2(src_e)<1.0e-5);
  assert(norm2(src_o)<1.0e-5);

-
-  }
-
  Grid_finalize();
 }
@@ -1,153 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./benchmarks/Benchmark_dwf.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-template<class d>
-struct scal {
-  d internal;
-};
-
-  Gamma::GammaMatrix Gmu [] = {
-    Gamma::GammaX,
-    Gamma::GammaY,
-    Gamma::GammaZ,
-    Gamma::GammaT
-  };
-
-bool overlapComms = false;
-
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
-    overlapComms = true;
-  }
-
-  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-
-  LatticeFermion src   (FGrid); random(RNG5,src);
-  LatticeFermion result(FGrid); result=zero;
-  LatticeFermion    ref(FGrid);    ref=zero;
-  LatticeFermion    tmp(FGrid);
-  LatticeFermion    err(FGrid);
-
-  ColourMatrix cm = Complex(1.0,0.0);
-
-  LatticeGaugeField Umu(UGrid); 
-  random(RNG4,Umu);
-
-  LatticeGaugeField Umu5d(FGrid); 
-
-  // replicate across fifth dimension
-  for(int ss=0;ss<Umu._grid->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
-    }
-  }
-
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  std::vector<LatticeColourMatrix> U(4,FGrid);
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-  }
-
-  if (1)
-  {
-    ref = zero;
-    for(int mu=0;mu<Nd;mu++){
-
-      tmp = U[mu]*Cshift(src,mu+1,1);
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-    }
-    ref = -0.5*ref;
-  }
-
-  RealD mass=0.1;
-  RealD M5  =1.8;
-
-  typename DomainWallFermionR::ImplParams params; 
-  params.overlapCommsCompute = overlapComms;
-  
-  RealD NP = UGrid->_Nprocessors;
-
-
-  QCD::WilsonKernelsStatic::AsmOpt=1;
-
-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
-  
-  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall =50;
-  if (1) {
-
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    double t1=usecond();
-    
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
-
-    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
-    err = ref-result; 
-    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    //    Dw.Report();
-  }
-  Grid_finalize();
-}
@@ -51,16 +51,18 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
  const int Ls=8;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-  if ( getenv("ASMOPT") )  {
-    QCD::WilsonKernelsStatic::AsmOpt=1;
-  } else { 
-    QCD::WilsonKernelsStatic::AsmOpt=0;
-  }
-
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
@@ -58,6 +58,19 @@ int main (int argc, char ** argv)
  std::vector<int> seeds({1,2,3,4});
  RealD mass = 0.1;

+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
@@ -1,175 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./tests/Test_zmm.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-
-using namespace Grid;
-using namespace Grid::QCD;
-
-
-int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
-
-int main(int argc,char **argv)
-{
-  Grid_init(&argc,&argv);
-  std::ofstream os("zmm.dat");
-
-  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
-  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
-  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
-  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-  for(int L=4;L<=32;L+=4){
-    for(int m=1;m<=2;m++){
-      for(int Ls=8;Ls<=16;Ls+=8){
-	std::vector<int> grid({L,L,m*L,m*L});
-  std::cout << GridLogMessage <<"\t";
-	for(int i=0;i<4;i++) { 
-	  std::cout << grid[i]<<"x";
-	}
-	std::cout << Ls<<"\t\t";
-	bench(os,grid,Ls);
-      }
-    }
-  }
-}
-
-int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
-{
-
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  std::vector<int> mpi_layout  = GridDefaultMpi();
-  int threads = GridThread::GetThreads();
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
-  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
-
-  LatticeFermion src (FGrid);
-  LatticeFermion tmp (FGrid);
-  LatticeFermion srce(FrbGrid);
-
-  LatticeFermion resulto(FrbGrid); resulto=zero;
-  LatticeFermion resulta(FrbGrid); resulta=zero;
-  LatticeFermion junk(FrbGrid); junk=zero;
-  LatticeFermion diff(FrbGrid); 
-  LatticeGaugeField Umu(UGrid);
-
-  double mfc, mfa, mfo, mfl1;
-
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  random(RNG5,src);
-#if 1
-  random(RNG4,Umu);
-#else
-  int mmu=2;
-  std::vector<LatticeColourMatrix> U(4,UGrid);
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-    if ( mu!=mmu ) U[mu] = zero;
-    if ( mu==mmu ) U[mu] = 1.0;
-    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
-  }
-#endif
- pickCheckerboard(Even,srce,src);
-
-  RealD mass=0.1;
-  RealD M5  =1.8;
-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-  int ncall=50;
-  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Dw.DhopOE(srce,resulto,0);
-  }
-  double t1=usecond();
-
-  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume/2;
-
-  mfc = flops*ncall/(t1-t0);
-  std::cout<<mfc<<"\t\t";
-
-  QCD::WilsonKernelsStatic::AsmOpt=1;
-  t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Dw.DhopOE(srce,resulta,0);
-  }
-  t1=usecond();
-  mfa = flops*ncall/(t1-t0);
-  std::cout<<mfa<<"\t\t";
-  /*
-  int dag=DaggerNo;
-  t0=usecond();
-  for(int i=0;i<1;i++){
-    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
-  }
-  t1=usecond();
-  mfo = flops*100/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
-
-  t0=usecond();
-  for(int i=0;i<1;i++){
-    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
-  }
-  t1=usecond();
-  mfl1= flops*100/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
-  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
-     << mfc<<" "
-     << mfa<<" "
-     << mfo<<" "
-     << mfl1<<std::endl;
-  */
-
-#if 0
-  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-    Dw.DhopOE(srce,resulta,0);
-    PerformanceCounter Counter(i);
-    Counter.Start();
-    Dw.DhopOE(srce,resulta,0);
-    Counter.Stop();
-    Counter.Report();
-  }
-#endif
-  //resulta = (-0.5) * resulta;
-
-  diff = resulto-resulta;
-  std::cout<<norm2(diff)<<std::endl;
-  return 0;
-}
-
-