diff --git a/Grid/qcd/utils/CovariantLaplacian.h b/Grid/qcd/utils/CovariantLaplacian.h
index d6323ed2..9e1204d4 100644
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@@ -109,8 +109,7 @@ public:
   };
   virtual GridBase *Grid(void) { return grid; };
-
-  virtual void M (const Field &_in, Field &_out)
+  virtual void Morig(const Field &_in, Field &_out)
   {
     ///////////////////////////////////////////////
     // Halo exchange for this geometry of stencil
     ///////////////////////////////////////////////
@@ -120,7 +119,8 @@ public:
     ///////////////////////////////////
     // Arithmetic expressions
     ///////////////////////////////////
-    auto st = Stencil.View(AcceleratorRead);
+//  auto st = Stencil.View(AcceleratorRead);
+    autoView( st , Stencil , AcceleratorRead);
     auto buf = st.CommBuf();
 
     autoView( in , _in , AcceleratorRead);
@@ -172,7 +172,165 @@ public:
       coalescedWrite(out[ss], res,lane);
     });
+  };
+  virtual void Mnew (const Field &_in, Field &_out)
+  {
+    ///////////////////////////////////////////////
+    // Halo exchange for this geometry of stencil
+    ///////////////////////////////////////////////
+//  Stencil.HaloExchange(_in, Compressor);
+    std::vector<std::vector<CommsRequest_t> > requests;
+    Stencil.Prepare();
+    {
+      GRID_TRACE("Laplace Gather");
+      Stencil.HaloGather(_in,Compressor);
+    }
+
+    tracePush("Laplace Communication");
+    Stencil.CommunicateBegin(requests);
+    {
+      GRID_TRACE("MergeSHM");
+      Stencil.CommsMergeSHM(Compressor);
+    }
+
+
+    ///////////////////////////////////
+    // Arithmetic expressions
+    ///////////////////////////////////
+//  auto st = Stencil.View(AcceleratorRead);
+    autoView( st , Stencil , AcceleratorRead);
+    auto buf = st.CommBuf();
+
+    autoView( in , _in , AcceleratorRead);
+    autoView( out , _out , AcceleratorWrite);
+    autoView( U , Uds , AcceleratorRead);
+
+    typedef typename Field::vector_object vobj;
+    typedef decltype(coalescedRead(in[0])) calcObj;
+    typedef decltype(coalescedRead(U[0](0))) calcLink;
+
+    const int Nsimd = vobj::Nsimd();
+    const uint64_t NN = grid->oSites();
+
+    accelerator_for( ss, NN, Nsimd, {
+
+      StencilEntry *SE;
+
+      const int lane=acceleratorSIMTlane(Nsimd);
+
+      calcObj chi;
+      calcObj res;
+      calcObj Uchi;
+      calcObj Utmp;
+      calcObj Utmp2;
+      calcLink UU;
+      calcLink Udag;
+      int ptype;
+
+      res = coalescedRead(in[ss])*(-8.0);
+
+
+      SE = st.GetEntry(ptype, 0, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(0,Xp);
+      }
+      SE = st.GetEntry(ptype, 1, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(1,Yp);
+      }
+      SE = st.GetEntry(ptype, 2, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(2,Zp);
+      }
+      SE = st.GetEntry(ptype, 3, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(3,Tp);
+      }
+      SE = st.GetEntry(ptype, 4, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(4,Xm);
+      }
+      SE = st.GetEntry(ptype, 5, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(5,Ym);
+      }
+      SE = st.GetEntry(ptype, 6, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(6,Zm);
+      }
+      SE = st.GetEntry(ptype, 7, ss);
+      if (SE->_is_local ) {
+        LEG_LOAD_MULT(7,Tm);
+      }
+
+      coalescedWrite(out[ss], res,lane);
+    });
+
+    Stencil.CommunicateComplete(requests);
+    tracePop("Communication");
+
+    {
+      GRID_TRACE("Merge");
+      Stencil.CommsMerge(Compressor);
+    }
+
+
+    accelerator_for( ss, NN, Nsimd, {
+
+      StencilEntry *SE;
+
+      const int lane=acceleratorSIMTlane(Nsimd);
+
+      calcObj chi;
+      calcObj res;
+      calcObj Uchi;
+      calcObj Utmp;
+      calcObj Utmp2;
+      calcLink UU;
+      calcLink Udag;
+      int ptype;
+
+//    res = coalescedRead(in[ss])*(-8.0);
+      res = coalescedRead(out[ss]);
+
+      SE = st.GetEntry(ptype, 0, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(0,Xp);
+      }
+      SE = st.GetEntry(ptype, 1, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(1,Yp);
+      }
+      SE = st.GetEntry(ptype, 2, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(2,Zp);
+      }
+      SE = st.GetEntry(ptype, 3, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(3,Tp);
+      }
+      SE = st.GetEntry(ptype, 4, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(4,Xm);
+      }
+      SE = st.GetEntry(ptype, 5, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(5,Ym);
+      }
+      SE = st.GetEntry(ptype, 6, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(6,Zm);
+      }
+      SE = st.GetEntry(ptype, 7, ss);
+      if ((SE->_is_local )==0){
+        LEG_LOAD_MULT(7,Tm);
+      }
+
+      coalescedWrite(out[ss], res,lane);
+    });
+  };
+  virtual void M(const Field &in, Field &out) {Mnew(in,out);};
   virtual void Mdag (const Field &in, Field &out) { M(in,out);};   // Laplacian is hermitian
   virtual void Mdiag (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid
   virtual void Mdir  (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
diff --git a/Grid/qcd/utils/CovariantLaplacianRat.h b/Grid/qcd/utils/CovariantLaplacianRat.h
index d0eae17a..6cd2d927 100644
--- a/Grid/qcd/utils/CovariantLaplacianRat.h
+++ b/Grid/qcd/utils/CovariantLaplacianRat.h
@@ -189,12 +189,12 @@ public:
     for(int i =0;i,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
-  #if USE_CHRONO
+#if USE_CHRONO
       MinvMom[i] = Forecast(QuadOp, right_nu, prev_solns[nu]);
-  #endif
-  #ifndef MIXED_CG
+#endif
+#ifndef MIXED_CG
       CG(QuadOp,right_nu,MinvMom[i]);
-  #else
+#else
       QuadLinearOperator,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2);
 //   QuadLinearOperator,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2);
       MixedPrecisionConjugateGradient MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp);
@@ -227,9 +227,9 @@ public:
 //   Laplacian.M(MinvGMom, LMinvGMom);
       MixedCG(right_nu,MinvMom[i]);
 #endif
-  #if USE_CHRONO
+#if USE_CHRONO
       prev_solns[nu].push_back(MinvGMom);
-  #endif
+#endif
      LapStencil.M(MinvMom[i], Gtemp2);
      LMinvMom=fac*Gtemp2;
      AMinvMom = par.a1[i]*LMinvMom;
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 5591fc69..7413a5b8 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -641,6 +641,342 @@ public:
   std::cout< mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
+                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                             GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+
+    ///////// Welcome message ////////////
+    std::cout< seeds4({1,2,3,4});
+    GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+
+    typedef typename PeriodicGimplF::Field GaugeFieldF;
+    typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF;
+
+    Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+      CovariantAdjointLaplacianStencil LapStencilF(FGrid);
+      QuadLinearOperator,GaugeLinkFieldF> QuadOpF(LapStencilF,c2,c1,1.);
+
+      const int num_cases = 4;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
+      controls Cases [] = {
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential }
+      };
+
+      for(int c=0;c(Umu, 0);
+
+        SU<Nc>::HotConfiguration(RNG4,Umu);
+        LapStencilF.GaugeImport(Umu);
+
+        GaugeLinkFieldF r_l(FGrid);
+
+        std::cout<Barrier();
+        for(int i=0;i CG(1e-7,10000,false);
+
+        FGrid->Barrier();
+        double t1=usecond();
+        uint64_t ncall = 500;
+
+        FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+        for(uint64_t i=0;iBarrier();
+
+        double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops;
+        if ( mflops mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
+                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                             GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+
+    ///////// Welcome message ////////////
+    std::cout< seeds4({1,2,3,4});
+    GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+//  typedef ImprovedStaggeredFermionF Action;
+//  typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+
+    Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
+
+//  typename Action::ImplParams params;
+//  Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+//  PeriodicGimplF
+    typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF;
+
+    ///////// Source preparation ////////////
+    GaugeLinkFieldF src  (FGrid); random(RNG4,src);
+//  GaugeLinkFieldF src_e (FrbGrid);
+//  GaugeLinkFieldF src_o (FrbGrid);
+//  GaugeLinkFieldF r_e (FrbGrid);
+//  GaugeLinkFieldF r_o (FrbGrid);
+    GaugeLinkFieldF r_eo (FGrid);
+
+    {
+
+      // pickCheckerboard(Even,src_e,src);
+      // pickCheckerboard(Odd,src_o,src);
+
+      const int num_cases = 1;
+      std::string fmt("G/O/C ");
+
+      controls Cases [] = {
+        { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      };
+
+      for(int c=0;c LapStencilF(FGrid);
+        QuadLinearOperator,PeriodicGimplF::LinkField> QuadOpF(LapStencilF,c2,c1,1.);
+        LapStencilF.GaugeImport(Umu);
+
+
+        StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+        StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout<Barrier();
+        for(int i=0;iBarrier();
+        double t1=usecond();
+        uint64_t ncall = 500;
+
+        FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+        for(uint64_t i=0;iBarrier();
+
+        double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops;
+        if ( mflopsBarrier();
+
+      }
+
+      std::cout<Barrier();
+
+      for(int i=0;i L_list({8,12,16,24,32});
+  std::vector<int> L_list({16,24,32});
   int selm1=sel-1;
 
   std::vector<double> wilson;
   std::vector<double> dwf4;
   std::vector<double> staggered;
+  std::vector<double> lap;
 
   int Ls=1;
   std::cout<
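
Note (editor's sketch, not part of the patch): Mnew replaces the blocking Stencil.HaloExchange used by Morig with an explicit gather / begin-comms / interior-compute / complete-comms / exterior-compute sequence, so the legs of the eight-point stencil whose neighbours are local (SE->_is_local) are evaluated while the halo is still in flight, and the off-node legs are accumulated onto the partial result after CommunicateComplete. The outline below is a minimal sketch of that pattern only; the stencil calls (Prepare, HaloGather, CommunicateBegin, CommsMergeSHM, CommunicateComplete, CommsMerge) mirror the ones in the diff, while overlappedApply, interior and exterior are illustrative placeholder names, not Grid API.

// Sketch of the comms/compute overlap skeleton that Mnew follows.
// Assumes a Grid build (CommsRequest_t, CartesianStencil, compressor types);
// the two kernel arguments stand for the two accelerator_for passes above.
#include <vector>

template<class Field, class Stencil, class Compressor,
         class InteriorKernel, class ExteriorKernel>
void overlappedApply(Stencil &St, Compressor &Cmp,
                     const Field &in, Field &out,
                     InteriorKernel interior, ExteriorKernel exterior)
{
  std::vector<std::vector<CommsRequest_t> > requests;

  St.Prepare();
  St.HaloGather(in, Cmp);            // pack halo faces for this stencil geometry
  St.CommunicateBegin(requests);     // start asynchronous transfers
  St.CommsMergeSHM(Cmp);             // resolve intra-node (shared-memory) faces

  interior(in, out);                 // first pass: legs with SE->_is_local set,
                                     // overlapped with the communication above

  St.CommunicateComplete(requests);  // wait for remote halo data
  St.CommsMerge(Cmp);                // unpack the received faces

  exterior(in, out);                 // second pass: legs with SE->_is_local==0,
                                     // accumulated onto the partial result in out
}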