Merge branch 'master' of https://github.com/paboyle/Grid into scidac1_2

Conflicts: lib/qcd/action/fermion/WilsonKernels.h tests/Make.inc
2025-11-21 06:59:32 +00:00 · 2015-12-15 11:11:59 -05:00
parent 284453c5e9 af855cc129
commit bc34b7e808
36 changed files with 9091 additions and 185 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,7 @@
 Makefile.in
 Makefile
 Config.h
+Config.h.in
 config.log
 config.status
 .deps
--- a/39
+++ b/39
@@ -1,22 +1,11 @@
-RECENT
---------------
-
-  - Clean up HMC                                                             -- DONE
-  - LorentzScalar<GaugeField> gets Gauge link type (cleaner).                -- DONE
-  - Simplified the integrators a bit.                                        -- DONE
-  - Multi-timescale looks broken and operating on single timescale for now.  -- DONE
-  - pass GaugeField as template param.                        -- DONE
-  - Reunitarise                                               -- DONE
-  - Force Gradient                                            -- DONE
-  - Prefer "RefreshInternal" or such like to "init" in naming -- DONE
-  - Parallel io improvements                                  -- DONE
-  - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
-
 TODO:
 ---------------
 Policies:
+
 * Link smearing/boundary conds; Policy class based implementation ; framework more in place
+
 * Support different boundary conditions (finite temp, chem. potential ... )
+
 * Support different fermion representations? 
  - contained entirely within the integrator presently

@@ -28,6 +17,8 @@ Policies:

 - Lanczos

+- Audit oIndex usage for cb behaviour
+
 - Rectangle gauge actions.
  Iwasaki,
  Symanzik,
@@ -41,6 +32,12 @@ Policies:

 - FFTnD ?

+- Gparity; hand opt use template specialisation elegance to enable the optimised paths ?
+- Gparity force term; Gparity (R)HMC.
+- Random number state save restore
+- Mobius implementation clean up to remove #if 0 stale code sequences
+- CG -- profile carefully, kernel fusion, whole CG performance measurements.
+
 ================================================================
 * Hacks and bug fixes to clean up and Audits
 ================================================================
@@ -90,6 +87,20 @@ Not sure of status of this -- reverify. Things are working nicely now though.
 ======================================================================
 ======================================================================
 ======================================================================
+RECENT
+---------------
+
+  - Clean up HMC                                                             -- DONE
+  - LorentzScalar<GaugeField> gets Gauge link type (cleaner).                -- DONE
+  - Simplified the integrators a bit.                                        -- DONE
+  - Multi-timescale looks broken and operating on single timescale for now.  -- DONE
+  - pass GaugeField as template param.                        -- DONE
+  - Reunitarise                                               -- DONE
+  - Force Gradient                                            -- DONE
+  - Prefer "RefreshInternal" or such like to "init" in naming -- DONE
+  - Parallel io improvements                                  -- DONE
+  - Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
+
 Done: Cayley, Partial , ContFrac force terms.

 DONE
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -68,11 +68,11 @@ int main (int argc, char ** argv)
    for(int mu=0;mu<Nd;mu++){

      tmp = U[mu]*Cshift(src,mu+1,1);
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }
@@ -111,13 +111,13 @@ int main (int argc, char ** argv)
      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
      tmp = U[mu]*Cshift(src,mu+1,1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
    ref = -0.5*ref;
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -75,13 +75,13 @@ int main (int argc, char ** argv)
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
  }
@@ -124,13 +124,13 @@ int main (int argc, char ** argv)
      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
  }
--- a/8052
+++ b/8052
--- a/configure.ac
+++ b/configure.ac
@@ -164,6 +164,21 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])

+AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],[ac_CHROMA=${enable_chroma}],[ac_CHROMA=no])
+
+case ${ac_CHROMA} in
+     yes)
+       echo Enabling tests regressing to Chroma
+     ;;
+     no)
+       echo Disabling tests regressing to Chroma
+     ;;
+     *)
+     AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
+     ;;
+esac
+
+AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])

 ###################################################################
 # Checks for doxygen support
@@ -184,6 +199,7 @@ echo :::::::::::::::::::::::::::::::::::::::::::
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
+AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_OUTPUT

--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@

-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h 
./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h 
./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h
+HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/DenseMatrix.h ./algorithms/iterative/EigenSort.h ./algorithms/iterative/Francis.h ./algorithms/iterative/Householder.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/Matrix.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./PerfCount.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h 
./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Avx512Asm.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h 
./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h

-CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./PerfCount.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsAsm.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
--- a/lib/PerfCount.cc
+++ b/lib/PerfCount.cc
@@ -3,8 +3,11 @@
 #include <PerfCount.h>

 namespace Grid {
+
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
+
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
+#ifdef __linux__
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." },
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." },
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." },
@@ -23,6 +26,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS."},
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......"},
  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS...."}
+#endif
  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 };
 }
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -11,6 +11,8 @@
 #ifdef __linux__
 #include <syscall.h>
 #include <linux/perf_event.h>
+#else
+#include <sys/syscall.h>
 #endif
 namespace Grid {

@@ -30,6 +32,7 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,

 class PerformanceCounter {
 private:
+
  typedef struct { 
  public:
    uint32_t type;
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -11,7 +11,6 @@
 // Vector types are arch dependent
 ////////////////////////////////////////////////////////////////////////

-typedef uint32_t Integer;

 #define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
 #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
@@ -20,6 +19,8 @@ typedef uint32_t Integer;

 namespace Grid {

+  typedef uint32_t Integer;
+
  typedef  float  RealF;
  typedef  double RealD;
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -166,8 +166,9 @@ public:
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
-	pcoor[mu] = gcoor[mu]/_ldimensions[mu];
-	lcoor[mu] = gcoor[mu]%_ldimensions[mu];
+	int _fld  = _fdimensions[mu]/_processors[mu];
+	pcoor[mu] = gcoor[mu]/_fld;
+	lcoor[mu] = gcoor[mu]%_fld;
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@@ -176,8 +177,16 @@ public:
      std::vector<int> lcoor;
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
      rank = RankFromProcessorCoor(pcoor);
-      i_idx= iIndex(lcoor);
-      o_idx= oIndex(lcoor);
+
+      std::vector<int> cblcoor(lcoor);
+      for(int d=0;d<cblcoor.size();d++){
+	if( this->CheckerBoarded(d) ) {
+	  cblcoor[d] = lcoor[d]/2;
+	}
+      }
+
+      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
+      o_idx= oIndex(lcoor);// this implies divide by 2 on checkerdim
    }

    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -114,7 +114,7 @@ namespace QCD {
    // Apply Dw
    this->DW(psi,Din,DaggerYes); 

-    Meooe5D(Din,chi);
+    MeooeDag5D(Din,chi);

    int Ls=this->Ls;
    for(int s=0;s<Ls;s++){
@@ -163,7 +163,6 @@ namespace QCD {
    FermionField tmp(psi._grid);
    // Assemble the 5d matrix
    Meooe5D(psi,tmp); 
-
 #if 0
    std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
    for(int s=0;s<Ls;s++){
@@ -202,7 +201,7 @@ namespace QCD {
      this->DhopOE(psi,tmp,DaggerYes);
    }

-    Meooe5D(tmp,chi); 
+    MeooeDag5D(tmp,chi); 
 #if 0
    std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
    // Assemble the 5d matrix
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -3,9 +3,12 @@ namespace Grid {
 namespace QCD {

 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
-						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						  int sF,int sU,const FermionField &in, FermionField &out)
+WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
+
+template<class Impl> 
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					   int sF,int sU,const FermionField &in, FermionField &out)
 {
  SiteHalfSpinor  tmp;    
  SiteHalfSpinor  chi;    
@@ -122,7 +125,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 };

 template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					      int sF,int sU,const FermionField &in, FermionField &out)
 {
@@ -369,6 +372,16 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
  vstream(out._odata[sF],result*(-0.5));
 }

+#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
+template<class Impl> 
+void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p)
+{
+  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+}
+#endif
+
  FermOpTemplateInstantiate(WilsonKernels);
  GparityFermOpTemplateInstantiate(WilsonKernels);

--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -28,11 +28,11 @@ namespace Grid {
     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
-#if defined(AVX512) || defined(IMCI)
+
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
-#else
+#if 0
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
@@ -41,7 +41,7 @@ namespace Grid {
 #endif
 // doesn't seem to work with Gparity at the moment
 #undef HANDOPT
-#ifdef HANDOPT
+//#define HANDOPT
     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			       int sF,int sU,const FermionField &in, FermionField &out);
@@ -49,25 +49,9 @@ namespace Grid {
     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 				  int sF,int sU,const FermionField &in, FermionField &out);
-#else
-
-     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-			       int sF,int sU,const FermionField &in, FermionField &out)
-     {
-       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-     }
-
-     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-				  int sF,int sU,const FermionField &in, FermionField &out)
-     {
-       DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-     }
-#endif
-
-     WilsonKernels(const ImplParams &p= ImplParams()) : Base(p) {};

+     WilsonKernels(const ImplParams &p= ImplParams());
+     
    };

  }
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -282,7 +282,7 @@ namespace QCD {

 #ifdef HANDOPT
 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 }

 template<class Impl>
-void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						   int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -767,6 +767,36 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
    vstream(ref()(3)(2),result_32*(-0.5));
  }
 }
-  FermOpTemplateInstantiate(WilsonKernels);
+#else 
+template<class Impl>
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+					       int ss,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSite(st,U,buf,ss,sU,in,out); // HANDOPT off: forward site index ss to the generic kernel; will template override for Wilson Nc=3
+}
+
+template<class Impl>
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						   int ss,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSiteDag(st,U,buf,ss,sU,in,out); // HANDOPT off: forward site index ss to the generic dagger kernel; will template override for Wilson Nc=3
+}
+
 #endif
+
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							  int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							  int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+							  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							  int ss,int sU,const FermionField &in, FermionField &out);
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+							  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							  int ss,int sU,const FermionField &in, FermionField &out);
+
 }}
--- a/lib/qcd/spin/TwoSpinor.h
+++ b/lib/qcd/spin/TwoSpinor.h
@@ -45,13 +45,13 @@ namespace QCD {
  // To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) )
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
-      hspin(0)=fspin(0)-timesI(fspin(3));
-      hspin(1)=fspin(1)-timesI(fspin(2));
+      hspin(0)=fspin(0)+timesI(fspin(3));
+      hspin(1)=fspin(1)+timesI(fspin(2));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
-      hspin(0)=fspin(0)+timesI(fspin(3));
-      hspin(1)=fspin(1)+timesI(fspin(2));
+      hspin(0)=fspin(0)-timesI(fspin(3));
+      hspin(1)=fspin(1)-timesI(fspin(2));
    }

      //  0 0  0  -1  [0] -+ [3]
@@ -60,13 +60,13 @@ namespace QCD {
      // -1 0  0  0
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
-      hspin(0)=fspin(0)+fspin(3);
-      hspin(1)=fspin(1)-fspin(2);
+      hspin(0)=fspin(0)-fspin(3);
+      hspin(1)=fspin(1)+fspin(2);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
-      hspin(0)=fspin(0)-fspin(3);
-      hspin(1)=fspin(1)+fspin(2);
+      hspin(0)=fspin(0)+fspin(3);
+      hspin(1)=fspin(1)-fspin(2);
    }
 	    /*Gz
 	     *  0 0  i  0   [0]+-i[2]
@@ -76,14 +76,14 @@ namespace QCD {
 	     */
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
-      hspin(0)=fspin(0)-timesI(fspin(2));
-      hspin(1)=fspin(1)+timesI(fspin(3));
+      hspin(0)=fspin(0)+timesI(fspin(2));
+      hspin(1)=fspin(1)-timesI(fspin(3));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
-      hspin(0)=fspin(0)+timesI(fspin(2));
-      hspin(1)=fspin(1)-timesI(fspin(3));
+      hspin(0)=fspin(0)-timesI(fspin(2));
+      hspin(1)=fspin(1)+timesI(fspin(3));
    }
 	    /*Gt
 	     *  0 0  1  0 [0]+-[2]
@@ -94,14 +94,14 @@ namespace QCD {
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjTp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
-      hspin(0)=fspin(0)-fspin(2);
-      hspin(1)=fspin(1)-fspin(3);
+      hspin(0)=fspin(0)+fspin(2);
+      hspin(1)=fspin(1)+fspin(3);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
-      hspin(0)=fspin(0)+fspin(2);
-      hspin(1)=fspin(1)+fspin(3);
+      hspin(0)=fspin(0)-fspin(2);
+      hspin(1)=fspin(1)-fspin(3);
    }
 	    /*G5
 	     *  1 0  0  0 
@@ -157,32 +157,32 @@ namespace QCD {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=timesI(hspin(1));
-      fspin(3)=timesI(hspin(0));
+      fspin(2)=timesMinusI(hspin(1));
+      fspin(3)=timesMinusI(hspin(0));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=timesMinusI(hspin(1));
-      fspin(3)=timesMinusI(hspin(0));
+      fspin(2)=timesI(hspin(1));
+      fspin(3)=timesI(hspin(0));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)+=timesI(hspin(1));
-      fspin(3)+=timesI(hspin(0));
+      fspin(2)-=timesI(hspin(1));
+      fspin(3)-=timesI(hspin(0));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)-=timesI(hspin(1));
-      fspin(3)-=timesI(hspin(0));
+      fspin(2)+=timesI(hspin(1));
+      fspin(3)+=timesI(hspin(0));
    }

      //  0 0  0  -1  [0] -+ [3]
@@ -195,32 +195,32 @@ namespace QCD {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=-hspin(1);
-      fspin(3)= hspin(0);
+      fspin(2)= hspin(1);
+      fspin(3)=-hspin(0);//Unary minus?
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)= hspin(1);
-      fspin(3)=-hspin(0);//Unary minus?
+      fspin(2)=-hspin(1);
+      fspin(3)= hspin(0);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)-=hspin(1);
-      fspin(3)+=hspin(0);
+      fspin(2)+=hspin(1);
+      fspin(3)-=hspin(0);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)+=hspin(1);
-      fspin(3)-=hspin(0);
+      fspin(2)-=hspin(1);
+      fspin(3)+=hspin(0);
    }

 	    /*Gz
@@ -234,32 +234,32 @@ namespace QCD {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=     timesI(hspin(0));
-      fspin(3)=timesMinusI(hspin(1));
+      fspin(2)=timesMinusI(hspin(0));
+      fspin(3)=timesI(hspin(1));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=timesMinusI(hspin(0));
-      fspin(3)=timesI(hspin(1));
+      fspin(2)=     timesI(hspin(0));
+      fspin(3)=timesMinusI(hspin(1));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)+=timesI(hspin(0));
-      fspin(3)-=timesI(hspin(1));
+      fspin(2)-=timesI(hspin(0));
+      fspin(3)+=timesI(hspin(1));
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)-=timesI(hspin(0));
-      fspin(3)+=timesI(hspin(1));
+      fspin(2)+=timesI(hspin(0));
+      fspin(3)-=timesI(hspin(1));
    }
 	    /*Gt
 	     *  0 0  1  0 [0]+-[2]
@@ -272,32 +272,32 @@ namespace QCD {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=-hspin(0);
-      fspin(3)=-hspin(1);
+      fspin(2)=hspin(0);
+      fspin(3)=hspin(1);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)=hspin(0);
      fspin(1)=hspin(1);
-      fspin(2)=hspin(0);
-      fspin(3)=hspin(1);
+      fspin(2)=-hspin(0);
+      fspin(3)=-hspin(1);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)-=hspin(0);
-      fspin(3)-=hspin(1);
+      fspin(2)+=hspin(0);
+      fspin(3)+=hspin(1);
    }
  template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> strong_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
    {
      //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
      fspin(0)+=hspin(0);
      fspin(1)+=hspin(1);
-      fspin(2)+=hspin(0);
-      fspin(3)+=hspin(1);
+      fspin(2)-=hspin(0);
+      fspin(3)-=hspin(1);
    }
 	    /*G5
 	     *  1 0  0  0 
--- a/lib/serialisation/BaseIO.h
+++ b/lib/serialisation/BaseIO.h
@@ -22,7 +22,12 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    write(const std::string& s, const U &output);
    template <typename U>
-    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+    typename std::enable_if<std::is_enum<U>::value, void>::type
+    write(const std::string& s, const U &output);
+    template <typename U>
+    typename std::enable_if<
+      !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
+      void>::type
    write(const std::string& s, const U &output);
  private:
    T *upcast;
@@ -41,7 +46,12 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
    template <typename U>
-    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+    typename std::enable_if<std::is_enum<U>::value, void>::type
+    read(const std::string& s, U &output);
+    template <typename U>
+    typename std::enable_if<
+      !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
+      void>::type
    read(const std::string& s, U &output);
  protected:
    template <typename U>
@@ -146,7 +156,17 @@ namespace Grid {
  
  template <typename T>
  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  typename std::enable_if<std::is_enum<U>::value, void>::type
+  Writer<T>::write(const std::string &s, const U &output)
+  {
+    EnumIO<U>::write(*this, s, output);
+  }
+  
+  template <typename T>
+  template <typename U>
+  typename std::enable_if<
+    !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
+    void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    upcast->writeDefault(s, output);
@@ -181,7 +201,17 @@ namespace Grid {
  
  template <typename T>
  template <typename U>
-  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
+  typename std::enable_if<std::is_enum<U>::value, void>::type
+  Reader<T>::read(const std::string &s, U &output)
+  {
+    EnumIO<U>::read(*this, s, output);
+  }
+  
+  template <typename T>
+  template <typename U>
+  typename std::enable_if<
+    !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
+    void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    upcast->readDefault(s, output);
@@ -205,7 +235,7 @@ namespace Grid {
      abort();
    }
  }
-  
+
 }

 #endif
--- a/lib/serialisation/BinaryIO.cc
+++ b/lib/serialisation/BinaryIO.cc
@@ -1,36 +1,43 @@
 #include <Grid.h>

+using namespace Grid;
+using namespace std;

-namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-BinaryWriter::BinaryWriter(const std::string &fileName)
-: file_(fileName, std::ios::binary|std::ios::out)
+BinaryWriter::BinaryWriter(const string &fileName)
+: file_(fileName, ios::binary|ios::out)
 {}

 template <>
-void BinaryWriter::writeDefault(const std::string &s, const std::string &output)
+void BinaryWriter::writeDefault(const string &s, const string &x)
 {
-  uint64_t sz = output.size();
+    uint64_t sz = x.size();
+    
+    write("", sz);
+    for (uint64_t i = 0; i < sz; ++i)
+    {
+        write("", x[i]);
+    }
+}
+
+void BinaryWriter::writeDefault(const string &s, const char *x)
+{
+  string sx(x);
  
-  write("", sz);
-  for (uint64_t i = 0; i < sz; ++i)
-  {
-    write("", output[i]);
-  }
+  writeDefault(s, sx);
 }

 // Reader implementation ///////////////////////////////////////////////////////
-BinaryReader::BinaryReader(const std::string &fileName)
-: file_(fileName, std::ios::binary|std::ios::in)
+BinaryReader::BinaryReader(const string &fileName)
+: file_(fileName, ios::binary|ios::in)
 {}

 template <>
-void BinaryReader::readDefault(const std::string &s, std::string &output)
+void BinaryReader::readDefault(const string &s, string &output)
 {
-  uint64_t sz;
+    uint64_t sz;
  
-  read("", sz);
-  output.reserve(sz);
-  file_.read((char *)output.data(), sz);
-}
+    read("", sz);
+    output.resize(sz);
+    file_.read((char *)output.data(), sz);
 }
--- a/lib/serialisation/BinaryIO.h
+++ b/lib/serialisation/BinaryIO.h
@@ -22,6 +22,7 @@ namespace Grid {
    void writeDefault(const std::string &s, const U &x);
    template <typename U>
    void writeDefault(const std::string &s, const std::vector<U> &x);
+    void writeDefault(const std::string &s, const char *x);
  private:
    std::ofstream file_;
  };
@@ -48,6 +49,9 @@ namespace Grid {
    file_.write((char *)&x, sizeof(U));
  }
  
+  template <>
+  void BinaryWriter::writeDefault(const std::string &s, const std::string &x);
+  
  template <typename U>
  void BinaryWriter::writeDefault(const std::string &s, const std::vector<U> &x)
  {
@@ -67,6 +71,9 @@ namespace Grid {
    file_.read((char *)&output, sizeof(U));
  }
  
+  template <>
+  void BinaryReader::readDefault(const std::string &s, std::string &output);
+  
  template <typename U>
  void BinaryReader::readDefault(const std::string &s, std::vector<U> &output)
  {
--- a/lib/serialisation/MacroMagic.h
+++ b/lib/serialisation/MacroMagic.h
@@ -109,12 +109,11 @@ THE SOFTWARE.
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 #define GRID_MACRO_MEMBER(A,B)        A B;
-
 #define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
 #define GRID_MACRO_READ_MEMBER(A,B) Grid::read(RD,#B,obj. B);
 #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);

-#define GRID_DECL_CLASS_MEMBERS(cname,...)		\
+#define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)		\
  \
  \
  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
@@ -144,4 +143,51 @@ THE SOFTWARE.
  };  


+
+#define GRID_ENUM_TYPE(obj) std::remove_reference<decltype(obj)>::type
+#define GRID_MACRO_ENUMVAL(A,B) A = B,
+#define GRID_MACRO_ENUMCASE(A,B) case GRID_ENUM_TYPE(obj)::A: Grid::write(WR,s,#A); break;
+#define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;}
+#define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break;
+
+namespace Grid {
+  template <typename U>
+  class EnumIO {};
+}
+
+#define GRID_SERIALIZABLE_ENUM(name,undefname,...) /* varargs are (enumerator,value) pairs */\
+  enum class name {\
+      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\
+      undefname = -1 /* fallback enumerator used for unrecognised values */\
+  };\
+  \
+  template<>\
+  class EnumIO<name> {\
+    public:\
+      template <typename T>\
+      static void write(Writer<T> &WR,const std::string &s, const name &obj){ \
+        switch (obj) {\
+          GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
+          default: Grid::write(WR,s,#undefname); break;\
+        }\
+      }\
+      /* read: the stored name string is mapped back; any unknown string yields name::undefname */\
+      template <typename T>\
+      static void read(Reader<T> &RD,const std::string &s, name &obj){ \
+        std::string buf;\
+        Grid::read(RD, s, buf);\
+        if (buf == #undefname) {obj = name::undefname;}\
+        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
+        else {obj = name::undefname;}\
+      }\
+  };\
+  /* inline: the macro may be expanded in a header seen by several translation units */\
+  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
+    switch (obj) {\
+        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
+        default: os << #undefname; break;\
+    }\
+    return os;\
+  };
+
 #endif
--- a/lib/serialisation/TextIO.cc
+++ b/lib/serialisation/TextIO.cc
@@ -1,12 +1,14 @@
 #include <Grid.h>

-namespace Grid {
+using namespace Grid;
+using namespace std;
+
 // Writer implementation ///////////////////////////////////////////////////////
-TextWriter::TextWriter(const std::string &fileName)
-: file_(fileName, std::ios::out)
+TextWriter::TextWriter(const string &fileName)
+: file_(fileName, ios::out)
 {}

-void TextWriter::push(const std::string &s)
+void TextWriter::push(const string &s)
 {
  level_++;
 };
@@ -25,11 +27,11 @@ void TextWriter::indent(void)
 };

 // Reader implementation ///////////////////////////////////////////////////////
-TextReader::TextReader(const std::string &fileName)
-: file_(fileName, std::ios::in)
+TextReader::TextReader(const string &fileName)
+: file_(fileName, ios::in)
 {}

-void TextReader::push(const std::string &s)
+void TextReader::push(const string &s)
 {
  level_++;
 };
@@ -48,9 +50,9 @@ void TextReader::checkIndent(void)
    file_.get(c);
    if (c != '\t')
    {
-      std::cerr << "mismatch on tab " << c << " level " << level_;
-      std::cerr << " i "<< i <<std::endl;
-      std::abort();
+      cerr << "mismatch on tab " << c << " level " << level_;
+      cerr << " i "<< i << endl;
+      abort();
    }
  }
 }
@@ -58,8 +60,7 @@ void TextReader::checkIndent(void)
 template <>
 void TextReader::readDefault(const std::string &s, std::string &output)
 {
-  checkIndent();
-  output.clear();
-  getline(file_, output);
-}
+    checkIndent();
+    output.clear();
+    getline(file_, output);
 }
--- a/lib/serialisation/TextIO.h
+++ b/lib/serialisation/TextIO.h
@@ -20,9 +20,9 @@ namespace Grid
    void push(const std::string &s);
    void pop(void);
    template <typename U>
-    void writeDefault(const std::string &s, const U &output);
+    void writeDefault(const std::string &s, const U &x);
    template <typename U>
-    void writeDefault(const std::string &s, const std::vector<U> &output);
+    void writeDefault(const std::string &s, const std::vector<U> &x);
  private:
    void indent(void);
  private:
@@ -50,21 +50,21 @@ namespace Grid
  
  // Writer template implementation ////////////////////////////////////////////
  template <typename U>
-  void TextWriter::writeDefault(const std::string &s, const U &output)
+  void TextWriter::writeDefault(const std::string &s, const U &x)
  {
    indent();
-    file_ << std::boolalpha << output << std::endl;
+    file_ << std::boolalpha << x << std::endl;
  }
  
  template <typename U>
-  void TextWriter::writeDefault(const std::string &s, const std::vector<U> &output)
+  void TextWriter::writeDefault(const std::string &s, const std::vector<U> &x)
  {
-    uint64_t sz = output.size();
+    uint64_t sz = x.size();
    
    write(s, sz);
    for (uint64_t i = 0; i < sz; ++i)
    {
-      write(s, output[i]);
+      write(s, x[i]);
    }
  }
  
@@ -78,6 +78,9 @@ namespace Grid
    fromString(output, buf);
  }
  
+  template <>
+  void TextReader::readDefault(const std::string &s, std::string &output);
+  
  template <typename U>
  void TextReader::readDefault(const std::string &s, std::vector<U> &output)
  {
--- a/lib/serialisation/XmlIO.cc
+++ b/lib/serialisation/XmlIO.cc
@@ -1,8 +1,10 @@
 #include <Grid.h>

-namespace Grid {
+using namespace Grid;
+using namespace std;
+
 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const std::string &fileName)
+XmlWriter::XmlWriter(const string &fileName)
 : fileName_(fileName)
 {
  node_ = doc_.append_child();
@@ -14,7 +16,7 @@ XmlWriter::~XmlWriter(void)
  doc_.save_file(fileName_.c_str(), "  ");
 }

-void XmlWriter::push(const std::string &s)
+void XmlWriter::push(const string &s)
 {
  node_ = node_.append_child(s.c_str());
 }
@@ -25,22 +27,22 @@ void XmlWriter::pop(void)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const std::string &fileName)
+XmlReader::XmlReader(const string &fileName)
 : fileName_(fileName)
 {
  pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
  
  if ( !result )
  {
-    std::cerr << "XML error description: " << result.description() << "\n";
-    std::cerr << "XML error offset     : " << result.offset        << "\n";
-    std::abort();
+    cerr << "XML error description: " << result.description() << "\n";
+    cerr << "XML error offset     : " << result.offset        << "\n";
+    abort();
  }
  
  node_ = doc_.child("grid");
 }

-void XmlReader::push(const std::string &s)
+void XmlReader::push(const string &s)
 {
  node_ = node_.child(s.c_str());
 }
@@ -51,8 +53,7 @@ void XmlReader::pop(void)
 }

 template <>
-void XmlReader::readDefault(const std::string &s, std::string &output)
+void XmlReader::readDefault(const string &s, string &output)
 {
  output = node_.child(s.c_str()).first_child().value();
 }
-}
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -81,10 +81,12 @@ namespace Grid
    fromString(output, buf);
  }
  
+  template <>
+  void XmlReader::readDefault(const std::string &s, std::string &output);
+  
  template <typename U>
  void XmlReader::readDefault(const std::string &s, std::vector<U> &output)
  {
-    pugi::xml_node nodeCpy;
    std::string    buf;
    unsigned int   i = 0;
    
@@ -96,7 +98,6 @@ namespace Grid
      node_.child("elem").set_name("elem-done");
      i++;
    }
-    //    assert( is.tellg()==-1);
    pop();
  }
  
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -16,6 +16,7 @@
 #define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
 #endif

+namespace Grid {
 namespace Optimization {

  template<class vtype>
@@ -467,7 +468,7 @@ namespace Optimization {

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
-namespace Grid {
+
  typedef __m256  SIMD_Ftype; // Single precision type
  typedef __m256d SIMD_Dtype; // Double precision type
  typedef __m256i SIMD_Itype; // Integer type
@@ -488,8 +489,8 @@ namespace Grid {
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

+  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -11,8 +11,6 @@ echo CCFILES=$CCFILES >> Make.inc

 cd ..

-
-
 cd tests

 echo> Make.inc
@@ -32,6 +30,26 @@ echo ${BNAME}_LDADD=-lGrid>> Make.inc
 echo >> Make.inc
 done

+cd qdpxx
+
+# Regenerate tests/qdpxx/Make.inc from the T*.cc sources in this directory.
+TESTS=`ls T*.cc`
+TESTLIST=`echo ${TESTS} | sed 's/\.cc//g'`
+
+echo > Make.inc
+echo bin_PROGRAMS = ${TESTLIST} >> Make.inc
+echo >> Make.inc
+
+for f in $TESTS
+do
+BNAME=`basename $f .cc`
+echo >> Make.inc
+echo ${BNAME}_SOURCES=$f  >> Make.inc
+echo ${BNAME}_LDADD=-lGrid>> Make.inc
+echo >> Make.inc
+done
+
+cd ..
 cd ..


--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_synthetic_lanczos Test_gparity
+bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi 


 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@@ -98,8 +98,8 @@ Test_gparity_SOURCES=Test_gparity.cc
 Test_gparity_LDADD=-lGrid


-#Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
-#Test_gpwilson_even_odd_LDADD=-lGrid
+Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
+Test_gpwilson_even_odd_LDADD=-lGrid


 Test_hmc_EODWFRatio_SOURCES=Test_hmc_EODWFRatio.cc
@@ -225,3 +225,10 @@ Test_wilson_force_phiMdagMphi_LDADD=-lGrid
 Test_wilson_force_phiMphi_SOURCES=Test_wilson_force_phiMphi.cc
 Test_wilson_force_phiMphi_LDADD=-lGrid

+Test_zmm_SOURCES=Test_zmm.cc
+Test_zmm_LDADD=-lGrid
+
+Test_RectPlaq_SOURCES=Test_RectPlaq.cc
+Test_RectPlaq_LDADD=-lGrid
+
+
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1,5 +1,12 @@
 # additional include paths necessary to compile the C++ library
+
+SUBDIRS = 
+if BUILD_CHROMA_REGRESSION
+  SUBDIRS+= qdpxx
+endif
+
 AM_CXXFLAGS = -I$(top_srcdir)/lib
 AM_LDFLAGS = -L$(top_builddir)/lib

+
 include Make.inc
--- a/tests/Test_cayley_cg.cc
+++ b/tests/Test_cayley_cg.cc
@@ -66,7 +66,8 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);

-  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid);
+  SU3::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);

  RealD mass=0.1;
--- a/tests/Test_cshift_red_black.cc
+++ b/tests/Test_cshift_red_black.cc
@@ -53,6 +53,7 @@ int main (int argc, char ** argv)


  TComplex cm;
+  TComplex cmeo;
  for(int dir=0;dir<Nd;dir++){
    //    if ( dir!=1 ) continue;
    for(int shift=0;shift<latt_size[dir];shift++){
@@ -125,7 +126,17 @@ int main (int argc, char ** argv)
 	  
 	  peekSite(cm,rbShiftU,coor);

-	  double nrm=norm2(U);
+	  Integer checkerboard = RBFine.CheckerBoard(coor);
+
+	  //	  std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
+	  //	  std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
+	  //	  std::cout << "Uo "   << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
+	  if ( checkerboard == ShiftUo.checkerboard ) {
+	    peekSite(cmeo,ShiftUo,coor);
+	  } else { 
+	    peekSite(cmeo,ShiftUe,coor);
+	  }
+

 	  std::vector<int> scoor(coor);
 	  scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
@@ -136,13 +147,29 @@ int main (int argc, char ** argv)
 	    + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

 	  Complex scm(slex);
-	  
-	  nrm = abs(scm-cm()()());
+
 	  std::vector<int> peer(4);
-	  Complex ctmp=cm;
+	  Complex ctmp=cmeo;
 	  Integer index=real(ctmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);

+	  double nrm = abs(cmeo()()()-scm);
+	  if (nrm != 0) {
+	    std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
+		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
+		     << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+	    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	    index=real(scm);
+	    Fine.CoorFromIndex(peer,index,latt_size);
+	    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	    exx=1;
+
+	  }
+
+	  ctmp=cm;
+	  index=real(ctmp);
+	  nrm = abs(scm-cm()()());
+
 	  if (nrm > 0){
 	    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
--- a/tests/Test_dwf_even_odd.cc
+++ b/tests/Test_dwf_even_odd.cc
@@ -27,8 +27,8 @@ int main (int argc, char ** argv)

  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
--- a/tests/Test_dwf_hdcr.cc
+++ b/tests/Test_dwf_hdcr.cc
@@ -9,7 +9,7 @@ using namespace Grid::QCD;
 class myclass: Serializable {
 public:

-  GRID_DECL_CLASS_MEMBERS(myclass,
+  GRID_SERIALIZABLE_CLASS_MEMBERS(myclass,
 			  int, domaindecompose,
 			  int, domainsize,
 			  int, order,
--- a/tests/Test_serialisation.cc
+++ b/tests/Test_serialisation.cc
@@ -1,30 +1,39 @@
 #include <Grid.h>

 namespace Grid {
+  
+  GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3);
+    
  class myclass: Serializable {
  public:
    
-    GRID_DECL_CLASS_MEMBERS(myclass,
+    GRID_SERIALIZABLE_CLASS_MEMBERS(myclass,
+                            myenum, e,
+                            std::vector<myenum>, ve,
+                            std::string, name,
                            int, x,
                            double, y,
                            bool , b,
-                            std::string, name,
                            std::vector<double>, array,
                            std::vector<std::vector<double>>, twodimarray,
                            );
    
    myclass() {}
    myclass(int i)
-    : array(4,5.1), twodimarray(3,std::vector<double>(2,1.23456))
+    : array(4,5.1), twodimarray(3,std::vector<double>(2,1.23456)), ve(2, myenum::blue)
    {
+      e=myenum::red;
      x=i;
      y=2*i;
      b=true;
      name="bother said pooh";
    }
  };
+  
 }

+using namespace Grid;
+
 int16_t i16 = 1;
 uint16_t u16 = 2;
 int32_t i32 = 3;
@@ -35,8 +44,6 @@ float    f = M_PI;
 double   d = 2*M_PI;
 bool     b = false;

-using namespace Grid;
-
 int main(int argc,char **argv)
 {
  {
@@ -59,6 +66,7 @@ int main(int argc,char **argv)
    myclass obj(1234); // non-trivial constructor
    write(WR,"obj",obj);
    WR.write("obj2", obj);
+    std::cout << obj << std::endl;
    
    std::vector<myclass> vec;
    vec.push_back(myclass(1234));
--- a/tests/qdpxx/Make.inc
+++ b/tests/qdpxx/Make.inc
@@ -0,0 +1,7 @@
+
+bin_PROGRAMS = Test_qdpxx_munprec
+
+
+Test_qdpxx_munprec_SOURCES=Test_qdpxx_munprec.cc
+Test_qdpxx_munprec_LDADD=-lGrid
+
--- a/tests/qdpxx/Makefile.am
+++ b/tests/qdpxx/Makefile.am
@@ -0,0 +1,7 @@
+# additional include paths necessary to compile the C++ library
+
+AM_CXXFLAGS = -I$(top_srcdir)/lib `chroma-config --cxxflags`
+AM_LDFLAGS = -L$(top_builddir)/lib `chroma-config --ldflags` `chroma-config --libs`
+
+
+include Make.inc
--- a/tests/qdpxx/Test_qdpxx_munprec.cc
+++ b/tests/qdpxx/Test_qdpxx_munprec.cc
@@ -0,0 +1,604 @@
+#include <Grid.h>
+
+int    Ls=8;              // fifth-dimension extent: given to makeFiveDimGrid and to the Chroma actions (N5 / RatPolyDeg)
+double M5=1.6;            // handed to the Chroma actions (OverMass) and to the Grid operators (_M5)
+double mq=0.01;           // fermion mass handed to both the Chroma and the Grid operators
+double zolo_lo = 0.1;     // lower approximation bound (ApproxMin) used by the *Zolo actions
+double zolo_hi = 2.0;     // upper approximation bound (ApproxMax) used by the *Zolo actions
+double mobius_scale=2.0;  // scale s; the Hm* actions use b5=0.5*(s+1) and c5=0.5*(s-1)
+
+enum ChromaAction { // selects which operator ChromaWrapper::GetLinOp (Chroma) and calc_grid (Grid) construct
+                 DWF,           // CPS style preconditioning
+		 WilsonFermion, // Wilson
+		 HwPartFracZolo, // KEK's approach
+		 HwContFracZolo, // Edwards, Kennedy et al prefer this
+		 HwPartFracTanh, // no branch for this value in GetLinOp or calc_grid
+		 HwContFracTanh, // no branch for this value in GetLinOp or calc_grid
+		 HwCayleyZolo, // Chiu Optimal
+		 HtCayleyZolo, // calc_grid builds ShamirZolotarevFermionR for this value
+		 HmCayleyZolo, // Scaled shamir 13
+		 HwCayleyTanh, // Scaled shamir
+		 HtCayleyTanh, // Plain old DWF.
+		 HmCayleyTanh, // Scaled shamir 13
+		 HtContFracTanh, // no branch for this value in GetLinOp or calc_grid
+		 HtContFracZolo // no branch for this value in GetLinOp or calc_grid
+};
+
+void calc_grid      (ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag); // Grid side: fills lat/src with random data, then res = M(src), or Mdag(src) when dag!=0
+void calc_chroma    (ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag); // Chroma side: applies the Chroma operator to the same lat/src and copies the result into res
+
+#include <chroma.h>
+#include <actions/ferm/invert/syssolver_linop_cg_array.h>
+#include <actions/ferm/invert/syssolver_linop_aggregate.h>
+
+
+
+namespace Chroma { 
+
+class ChromaWrapper {
+public:
+
+  
+  typedef multi1d<LatticeColorMatrix> U;
+  typedef LatticeFermion T4;
+  typedef multi1d<LatticeFermion> T5;
+  
+  static void ImportGauge(Grid::QCD::LatticeGaugeField & gr, // source: Grid gauge field, only read here
+			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) // destination: one QDP colour-matrix lattice per direction mu
+  {
+    Grid::QCD::LorentzColourMatrix LCM; // the four link matrices of one Grid site
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+    // x is the site coordinate used with Grid, cx the same coordinate in QDP's multi1d form.
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+    // Serial loop over every site of the global lattice.
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+      cx[0] = x[0];
+      cx[1] = x[1];
+      cx[2] = x[2];
+      cx[3] = x[3];
+      Grid::peekSite(LCM,gr,x);
+      // Copy each of the 4 links as a 3x3 colour matrix, element by element.
+      for(int mu=0;mu<4;mu++){
+	for(int i=0;i<3;i++){
+	for(int j=0;j<3;j++){
+	  cc = LCM(mu)()(i,j);
+	  c = QDP::cmplx(QDP::Real(real(cc)),QDP::Real(imag(cc))); // rebuild the value from its real/imag parts in QDP types
+	  QDP::pokeColor(cm,c,i,j);
+	}}
+	QDP::pokeSite(ch[mu],cm,cx); // write the assembled matrix into direction mu at this site
+	/*
+	std::cout << "("<<x[0]<<",";
+	std::cout << x[1]<<",";
+	std::cout << x[2]<<",";
+	std::cout << x[3]<<") "<< Grid::norm2(LCM(mu)) << " " <<QDP::norm2(cm)<<std::endl ;
+	*/
+      }
+
+    }}}}
+  }
+  
+  static void ImportFermion(Grid::QCD::LatticeFermion & gr, // source: Grid fermion on a five-coordinate grid, only read here
+			    QDP::multi1d<QDP::LatticeFermion> & ch  ) // destination: array of 4d QDP fermions indexed by s = x[0]
+  {
+    Grid::QCD::SpinColourVector F;
+    Grid::Complex c;
+    // QDP-side temporaries: full site fermion, one spin vector, one complex number.
+    QDP::Fermion cF;
+    QDP::SpinVector cS;
+    QDP::Complex cc;
+    // x has five entries (Grid coordinate), cx has four (QDP coordinate).
+    std::vector<int> x(5);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+    // Serial loop over every site of the global five-coordinate lattice.
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+    for (x[4]=0;x[4]<gd[4];x[4]++){
+      int s = x[0];   // Grid coordinate 0 selects the element of ch
+      cx[0] = x[1];   // Grid coordinates 1..4 become QDP coordinates 0..3
+      cx[1] = x[2];
+      cx[2] = x[3];
+      cx[3] = x[4];
+
+      Grid::peekSite(F,gr,x);
+      // For each colour j build the 4-component spin vector, then store it as colour j of cF.
+      for(int j=0;j<3;j++){
+	for(int sp=0;sp<4;sp++){
+
+	  c= F()(sp)(j) ;
+
+	  cc = QDP::cmplx(QDP::Real(real(c)),QDP::Real(imag(c)));
+
+	  QDP::pokeSpin(cS,cc,sp);
+
+	}
+	QDP::pokeColor(cF,cS,j);
+      }
+      QDP::pokeSite(ch[s],cF,cx); // write the site into ch[s]
+    }}}}}
+  }
+  static void ExportFermion(Grid::QCD::LatticeFermion & gr, // destination: Grid fermion on a five-coordinate grid, written site by site
+			    QDP::multi1d<QDP::LatticeFermion> & ch  ) // source: array of 4d QDP fermions indexed by s = x[0]
+  {
+    Grid::QCD::SpinColourVector F;
+    Grid::Complex c;
+    // QDP-side temporaries: full site fermion, one spin vector, one complex number.
+    QDP::Fermion cF;
+    QDP::SpinVector cS;
+    QDP::Complex cc;
+    // x has five entries (Grid coordinate), cx has four (QDP coordinate).
+    std::vector<int> x(5);
+    QDP::multi1d<int> cx(4);
+    std::vector<int> gd= gr._grid->GlobalDimensions();
+    // Serial loop over every site of the global five-coordinate lattice.
+    for (x[0]=0;x[0]<gd[0];x[0]++){
+    for (x[1]=0;x[1]<gd[1];x[1]++){
+    for (x[2]=0;x[2]<gd[2];x[2]++){
+    for (x[3]=0;x[3]<gd[3];x[3]++){
+    for (x[4]=0;x[4]<gd[4];x[4]++){
+      int s = x[0];   // Grid coordinate 0 selects the element of ch
+      cx[0] = x[1];   // Grid coordinates 1..4 become QDP coordinates 0..3
+      cx[1] = x[2];
+      cx[2] = x[3];
+      cx[3] = x[4];
+
+      cF = QDP::peekSite(ch[s],cx); // read the QDP site fermion
+      for(int sp=0;sp<4;sp++){
+	for(int j=0;j<3;j++){
+	  cS =QDP::peekColor(cF,j);
+	  cc =QDP::peekSpin(cS,sp);
+	  c = Grid::Complex(QDP::toDouble(QDP::real(cc)), 
+			    QDP::toDouble(QDP::imag(cc)));
+	  F()(sp)(j) = c; // component (spin sp, colour j) of the Grid site vector
+	}
+      }
+      Grid::pokeSite(F,gr,x); // write the assembled site into the Grid field
+    }}}}}
+  }
+
+  static Handle< LinearOperatorArray<T4> >  GetLinOp (U &u, ChromaAction parms) // returns the Chroma operator selected by parms, built on gauge field u
+  {
+    QDP::Real _mq(mq);              // QDP-typed copies of the file-level parameters
+    QDP::Real eps_lo(zolo_lo);
+    QDP::Real eps_hi(zolo_hi);
+    QDP::Real scale(mobius_scale);
+    // Same boundary entry (1) in all four directions.
+    QDP::multi1d<int> bcs(QDP::Nd);
+
+    bcs[0] = bcs[1] = bcs[2] = bcs[3] = 1;
+    // fbc/cfs are handed to the fermion-action constructors below.
+    Chroma::Handle<Chroma::FermBC<T4,U,U> > fbc(new Chroma::SimpleFermBC< T4, U, U >(bcs));
+    Chroma::Handle<Chroma::CreateFermState<T4,U,U> > cfs( new Chroma::CreateSimpleFermState<T4,U,U>(fbc));
+
+    // NOTE(review): invparm is filled in below but is not used anywhere else in this function.
+    Chroma::GroupXML_t invparm;
+    invparm.xml=std::string(
+"   <InvertParam>\n"
+"   <invType>CG_INVERTER</invType>\n"
+"   <RsdCG>1.0e-9</RsdCG>\n"
+"   <MaxCG>3000</MaxCG>\n"
+"   </InvertParam>"
+);
+
+    invparm.id=std::string("CG_INVERTER");
+    invparm.path=std::string("/InvertParam");
+    // One branch per supported action; each builds an action S_f, a state fs from u, and returns an operator.
+    if ( (parms == HtCayleyTanh)|| (parms==DWF) ) {
+      Chroma::UnprecDWFermActArray  S_f(cfs, M5, _mq, Ls);
+      Chroma::Handle< Chroma::FermState<T4,U,U> > fs( S_f.createState(u) );
+      Chroma::Handle< Chroma::LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,_mq));
+     return  M;
+    }
+    if ( parms == HwCayleyTanh ) {
+      QDP::Real b5 = 0.5;
+      QDP::Real c5 = 0.5;
+      Chroma::UnprecNEFFermActArray  S_f(cfs, M5,b5,c5, _mq, Ls);
+      Chroma::Handle< Chroma::FermState<T4,U,U> > fs( S_f.createState(u) );
+      Chroma::Handle< Chroma::LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,_mq));
+      return  M;
+    }
+    if ( parms == HmCayleyTanh ) {
+      Real b5 = 0.5*(scale +1.0); // b5/c5 derived from mobius_scale
+      Real c5 = 0.5*(scale -1.0);
+      UnprecNEFFermActArray  S_f(cfs, M5,b5,c5, _mq, Ls);
+      Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+      Handle< LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,_mq));
+      return  M;
+    }
+   if ( parms == HwCayleyZolo ) {
+     UnprecZoloNEFFermActArrayParams params;
+     params.OverMass=M5;
+     params.Mass=_mq;
+     params.b5=0.5;
+     params.c5=0.5;
+     params.N5=Ls;
+     params.approximation_type = COEFF_TYPE_ZOLOTAREV;
+     params.ApproxMin=eps_lo;
+     params.ApproxMax=eps_hi;
+     UnprecZoloNEFFermActArray  S_f(cfs, params);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,_mq));
+     return  M;
+   }
+   if ( parms == HtCayleyZolo ) {
+     UnprecZoloNEFFermActArrayParams params;
+     params.OverMass=M5;
+     params.Mass=_mq;
+     params.b5=1.0;
+     params.c5=0.0;
+     params.N5=Ls;
+     params.approximation_type = COEFF_TYPE_ZOLOTAREV;
+     params.ApproxMin=eps_lo;
+     params.ApproxMax=eps_hi;
+     UnprecZoloNEFFermActArray  S_f(cfs, params);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,_mq));
+     return M;
+   }
+   if ( parms == HmCayleyZolo ) {
+     UnprecZoloNEFFermActArrayParams params;
+     params.OverMass=M5;
+     params.Mass=_mq;
+     params.b5= 0.5*(mobius_scale +1.0); // same b5/c5 formulas as the HmCayleyTanh branch
+     params.c5= 0.5*(mobius_scale -1.0);
+     params.N5=Ls;
+     params.approximation_type = COEFF_TYPE_ZOLOTAREV;
+     params.ApproxMin=eps_lo;
+     params.ApproxMax=eps_hi;
+     UnprecZoloNEFFermActArray  S_f(cfs, params);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,_mq));
+     return M;
+   }
+   if ( parms == HwPartFracZolo ) {
+     if ( Ls%2 == 0 ) { // this branch refuses an even Ls and terminates the program
+       printf("Ls is not odd\n");
+       exit(-1);
+     }
+     UnprecOvExtFermActArrayParams param;
+     param.OverMass=M5; 
+     param.Mass=_mq;
+     param.RatPolyDeg = Ls;
+     param.ApproxMin =eps_lo;
+     param.ApproxMax =eps_hi;
+     param.b5 =1.0;
+     param.c5 =1.0;
+     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
+     //     param.approximation_type=COEFF_TYPE_TANH_UNSCALED;
+     //     param.approximation_type=COEFF_TYPE_TANH;
+     param.tuning_strategy_xml=
+"<TuningStrategy><Name>OVEXT_CONSTANT_STRATEGY</Name></TuningStrategy>\n";
+     UnprecOvExtFermActArray S_f(cfs,param);
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs)); // this action is queried with linOp(fs), not unprecLinOp(fs,_mq)
+     return M;
+   }
+   if ( parms == HwContFracZolo ) {
+     UnprecOvlapContFrac5DFermActParams param;
+     param.Mass=_mq; // How is M5 set? Wilson mass In AuxFermAct
+     param.ApproxMin=eps_lo;
+     param.ApproxMax=eps_hi;
+     param.approximation_type=COEFF_TYPE_ZOLOTAREV;
+     param.RatPolyDeg=Ls;
+     // The following is why I think Chroma made some directional errors:
+     param.AuxFermAct= std::string( // NOTE(review): the XML hard-codes Mass -1.8 instead of using M5 (1.6 in this file) - confirm intended
+"<AuxFermAct>\n"
+"  <FermAct>UNPRECONDITIONED_WILSON</FermAct>\n"
+"  <Mass>-1.8</Mass>\n"
+"  <b5>1</b5>\n"
+"  <c5>0</c5>\n"
+"  <MaxCG>1000</MaxCG>\n"
+"  <RsdCG>1.0e-9</RsdCG>\n"
+"  <FermionBC>\n"
+"      <FermBC>SIMPLE_FERMBC</FermBC>\n"
+"      <boundary>1 1 1 1</boundary>\n"
+"   </FermionBC> \n"
+"</AuxFermAct>"
+);
+     param.AuxFermActGrp= std::string("");
+     UnprecOvlapContFrac5DFermActArray S_f(fbc,param); // constructed from fbc, unlike the other branches which pass cfs
+     Handle< FermState<T4,U,U> > fs( S_f.createState(u) );
+     Handle< LinearOperatorArray<T4> > M(S_f.linOp(fs));
+     return  M;
+   }
+   assert(0); // no branch matched. NOTE(review): if assert is compiled out, control reaches the end without a return
+  }
+
+  static Chroma::Handle< Chroma::SystemSolver<QDP::LatticeFermion> > GetSolver(QDP::multi1d<QDP::LatticeColorMatrix> &u, ChromaAction parms) // returns S_f.qprop(...) for UnprecDWFermActArray; parms is not read in the body
+  {
+    QDP::multi1d<int> bcs(Nd);
+    bcs[0] = bcs[1] = bcs[2] = bcs[3] = 1; // same boundary entry (1) in all four directions
+    Chroma::Handle<Chroma::FermBC<T4,U,U> > fbc(new Chroma::SimpleFermBC< T4, U, U >(bcs));
+    Chroma::Handle<Chroma::CreateFermState<T4,U,U> > cfs( new Chroma::CreateSimpleFermState<T4,U,U>(fbc));
+    // Inverter parameters handed to qprop below.
+    Chroma::GroupXML_t invparm;
+    invparm.xml=std::string(
+			    "   <InvertParam>\n"
+			    "   <invType>CG_INVERTER</invType>\n"
+			    "   <RsdCG>1.0e-10</RsdCG>\n"
+			    "   <MaxCG>3000</MaxCG>\n"
+			    "   </InvertParam>"
+			    );
+    invparm.id=std::string("CG_INVERTER");
+    invparm.path=std::string("/InvertParam");
+    // The action is built from the file-level M5, mq and Ls, which are also echoed to stdout.
+    Chroma::UnprecDWFermActArray  S_f(cfs, M5, mq, Ls);
+    std::cout << "GetSolver: DWF 4d prec "<<std::endl;
+    std::cout << "GetSolver: M5 "<<M5<<std::endl;
+    std::cout << "GetSolver: mq "<<mq<<std::endl;
+    std::cout << "GetSolver: Ls "<<Ls<<std::endl;
+    Chroma::Handle< Chroma::FermState<T4,U,U> > fs( S_f.createState(u) );
+    Chroma::Handle< LinearOperatorArray<T4> > M(S_f.unprecLinOp(fs,mq)); // NOTE(review): M is constructed but not used afterwards
+    return  S_f.qprop(fs,invparm);
+  }
+};
+}
+
+int main (int argc,char **argv ) // for each listed action and dag=0,1: apply the Grid and Chroma operators and print the norm of their difference
+{
+
+  /********************************************************
+   * Setup QDP
+   *********************************************************/
+  Chroma::initialize(&argc,&argv);
+  Chroma::WilsonTypeFermActs4DEnv::registerAll(); 
+
+  /********************************************************
+   * Setup Grid
+   *********************************************************/
+  Grid::Grid_init(&argc,&argv);
+  Grid::GridCartesian * UGrid   = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
+									    Grid::GridDefaultSimd(Grid::QCD::Nd,Grid::vComplex::Nsimd()),
+									    Grid::GridDefaultMpi());
+  // Give QDP the same global lattice size as the Grid 4d grid.
+  std::vector<int> gd = UGrid->GlobalDimensions();
+  QDP::multi1d<int> nrow(QDP::Nd);
+  for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu];
+
+  QDP::Layout::setLattSize(nrow);
+  QDP::Layout::create();
+  // Fields are only constructed here; calc_grid fills lat and src on each call.
+  Grid::GridCartesian         * FGrid   = Grid::QCD::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  Grid::QCD::LatticeGaugeField lat(UGrid);
+  Grid::QCD::LatticeFermion    src(FGrid);
+  Grid::QCD::LatticeFermion    res_chroma(FGrid);
+  Grid::QCD::LatticeFermion    res_grid  (FGrid);
+  // ActionList and ActionName are parallel lists and must stay in the same order.
+  std::vector<ChromaAction> ActionList({
+		 HtCayleyTanh, // Plain old DWF.
+		 HmCayleyTanh,
+		 HwCayleyTanh,
+		 HtCayleyZolo, // calc_grid builds ShamirZolotarevFermionR for this one
+		 HmCayleyZolo,
+		 HwCayleyZolo
+  });
+  std::vector<std::string> ActionName({
+        "HtCayleyTanh",
+	"HmCayleyTanh",
+	"HwCayleyTanh",
+	"HtCayleyZolo",
+	"HmCayleyZolo",
+        "HwCayleyZolo"
+  });
+
+  for(int i=0;i<ActionList.size();i++) { // NOTE(review): int index compared against size(), a signed/unsigned comparison
+    std::cout << "*****************************"<<std::endl;
+    std::cout << "Action "<<ActionName[i]<<std::endl;
+    std::cout << "*****************************"<<std::endl;
+    for(int dag=0;dag<2;dag++) {
+
+      // calc_grid runs first because it is the call that fills lat and src; calc_chroma then reads them.
+      std::cout << "Dag =  "<<dag<<std::endl;
+      
+      calc_grid  (ActionList[i],lat,src,res_grid,dag);
+      
+      std::cout << "Norm of Grid DWF multiply "<<Grid::norm2(res_grid)<<std::endl;
+      
+      calc_chroma(ActionList[i],lat,src,res_chroma,dag);
+      
+      std::cout << "Norm of chroma DWF multiply "<<Grid::norm2(res_chroma)<<std::endl;
+      
+      res_chroma=res_chroma - res_grid; // res_chroma is reused to hold the difference
+      
+      std::cout << "Norm of difference "<<Grid::norm2(res_chroma)<<std::endl;
+    }
+  }
+
+  std::cout << "Finished test "<<std::endl;
+  // Only the Chroma side is finalized here; UGrid and FGrid are not deleted.
+  Chroma::finalize();
+}
+
+void calc_chroma(ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag) // copy lat/src to QDP, apply the Chroma operator for action, copy the result into res
+{
+  QDP::multi1d<QDP::LatticeColorMatrix> u(4); // QDP gauge field, one lattice per direction
+
+  //  Chroma::HotSt(u);
+  Chroma::ChromaWrapper::ImportGauge(lat,u) ;
+  // NOTE(review): lx, ly, lz, lt and procs are computed but not used later in this function.
+  int lx = QDP::Layout::subgridLattSize()[0];
+  int ly = QDP::Layout::subgridLattSize()[1];
+  int lz = QDP::Layout::subgridLattSize()[2];
+  int lt = QDP::Layout::subgridLattSize()[3];
+
+  QDP::multi1d<int> procs = QDP::Layout::logicalSize();
+  // check receives the operator output and psi the imported source; result is not used below.
+  QDP::multi1d<QDP::LatticeFermion>  check(Ls);
+  QDP::multi1d<QDP::LatticeFermion> result(Ls);
+  QDP::multi1d<QDP::LatticeFermion>  psi(Ls);
+
+  Chroma::ChromaWrapper::ImportFermion(src,psi);
+  // Print the norms of the imported fields.
+  for(int mu=0;mu<4;mu++){
+    std::cout <<"Imported Gauge norm ["<<mu<<"] "<< QDP::norm2(u[mu])<<std::endl;
+  }
+  std::cout <<"Imported Fermion norm "<< QDP::norm2(psi)<<std::endl;
+  // NOTE(review): the typedefs T and U are not referenced later in this function.
+  typedef QDP::LatticeFermion T;
+  typedef QDP::multi1d<QDP::LatticeColorMatrix> U;
+  
+  auto linop =Chroma::ChromaWrapper::GetLinOp(u, action);
+
+  printf("Calling Chroma Linop\n"); fflush(stdout);
+  // A non-zero dag applies the operator with Chroma::MINUS, otherwise with Chroma::PLUS.
+  if ( dag ) 
+    (*linop)(check,psi,Chroma::MINUS);
+  else
+    (*linop)(check,psi,Chroma::PLUS);
+
+  printf("Called Chroma Linop\n"); fflush(stdout);
+  // Copy the QDP result back into the Grid field res.
+  Chroma::ChromaWrapper::ExportFermion(res,check) ;
+}
+
+
+
+void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag) // overwrites Umu and src with random data, then res = M(src), or Mdag(src) when dag!=0
+{
+  using namespace Grid;
+  using namespace Grid::QCD;
+  // Fixed seeds: every call regenerates the same Umu and src.
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  // The 4d and 5d grids are taken from the fields passed in; red-black grids are created on every call.
+  Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu._grid;
+  Grid::GridCartesian         * FGrid   = (Grid::GridCartesian *) src._grid;
+  Grid::GridRedBlackCartesian * UrbGrid = Grid::QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); // NOTE(review): never released in this function - confirm ownership
+  Grid::GridRedBlackCartesian * FrbGrid = Grid::QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); // NOTE(review): never released in this function - confirm ownership
+
+  Grid::GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  Grid::GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  // Caller-visible side effect: src, res and Umu are all overwritten here.
+  Grid::gaussian(RNG5,src);
+  Grid::gaussian(RNG5,res);
+
+  Grid::QCD::SU3::HotConfiguration(RNG4,Umu);
+
+  /*
+  Grid::QCD::LatticeColourMatrix U(UGrid);
+  U=Grid::zero;
+  for(int nn=0;nn<Grid::QCD::Nd;nn++){
+    if ( nn>=4 ) {
+      Grid::PokeIndex<LorentzIndex>(Umu,U,nn);
+    }
+  }
+  */
+
+  Grid::RealD _mass=mq;
+  Grid::RealD _M5  =M5;
+  // One branch per supported action; each applies Mdag when dag is non-zero, M otherwise, and returns.
+  if ( action == HtCayleyTanh ) { 
+
+    Grid::QCD::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);
+
+    std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "<<std::endl;
+
+    if ( dag ) 
+      Ddwf.Mdag(src,res);  
+    else 
+      Ddwf.M(src,res);  
+    return;
+
+  } 
+
+  if ( action == HmCayleyZolo ) {
+
+    Grid::Real _b = 0.5*(mobius_scale +1.0); // same b/c formulas as b5/c5 in ChromaWrapper::GetLinOp
+    Grid::Real _c = 0.5*(mobius_scale -1.0);
+    Grid::QCD::MobiusZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);
+
+    std::cout << Grid::GridLogMessage <<" Calling mobius zolo multiply "<<std::endl;
+
+    if ( dag ) 
+      D.Mdag(src,res);  
+    else 
+      D.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HtCayleyZolo ) {
+
+    Grid::QCD::ShamirZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+
+    std::cout << Grid::GridLogMessage <<" Calling shamir zolo multiply "<<std::endl;
+
+    if ( dag ) 
+      D.Mdag(src,res);  
+    else 
+      D.M(src,res);  
+
+    return;
+  }
+
+  /*
+  if ( action == HmCayleyTanh ) {
+    Grid::Real _b = 0.5*(mobius_scale +1.0);
+    Grid::Real _c = 0.5*(mobius_scale -1.0);
+    Grid::QCD::MobiusFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c);
+
+    std::cout << Grid::GridLogMessage <<" Calling mobius tanh multiply "<<std::endl;
+
+    if ( dag ) 
+      D.Mdag(src,res);  
+    else 
+      D.M(src,res);  
+
+    return;
+
+  }
+  */
+
+  if ( action == HmCayleyTanh ) { // active HmCayleyTanh path; the MobiusFermionR variant above is commented out
+
+    Grid::QCD::ScaledShamirFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);
+
+    std::cout << Grid::GridLogMessage <<" Calling scaled shamir multiply "<<std::endl;
+
+    if ( dag ) 
+      D.Mdag(src,res);  
+    else 
+      D.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HwCayleyTanh ) {
+
+    Grid::QCD::OverlapWilsonCayleyTanhFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0); // last argument is the literal 1.0, not mobius_scale
+
+    if ( dag ) 
+      D.Mdag(src,res);  
+    else 
+      D.M(src,res);  
+
+    return;
+  }
+
+  if ( action == HwCayleyZolo ) {
+
+    Grid::QCD::OverlapWilsonCayleyZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+
+    if ( dag ) 
+      D.Mdag(src,res);  
+    else 
+      D.M(src,res);  
+
+    return;
+  }
+  
+  assert(0); // no branch matched: this action is not implemented on the Grid side
+}
+
+
+
+