From 0097b81778905afa7b391565b6040a9c96103618 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 13 May 2015 10:59:22 +0100
Subject: [PATCH] OMP dslash working

---
 TODO                           | 77 ++++++++++++++++++----------------
 benchmarks/Grid_wilson.cc      |  2 +-
 lib/qcd/Grid_qcd_wilson_dop.cc | 26 ++++++++----
 lib/qcd/Grid_qcd_wilson_dop.h  | 12 +++---
 4 files changed, 64 insertions(+), 53 deletions(-)
diff --git a/TODO b/TODO
index a75c867d..ed4dedd4 100644
--- a/TODO
+++ b/TODO
@@ -8,60 +8,39 @@
 
 * const audit
 
+Insert/Extract
 * Replace vset with a call to merge.; 
 * care in Gmerge,Gextract over vset .
 * extract / merge extra implementation removal      
-
-* Strong test for norm2, conj and all primitive types. -- tests/Grid_simd.cc is almost there
+* Optimise the extract/merge SIMD routines; Azusa??
+ - I have collated into single location at least.
+ - Need to use _mm_*insert/extract routines.
 
 * Thread scaling tests Xeon, XeonPhi
 
-================================================================
-*** New Functionality
-================================================================
-
-* Implement where within expression template scheme.
-
-* - BinaryWriter, TextWriter etc...
-  - use protocol buffers? replace xmlReader/Writer ec..
-  - Binary use htonll, htonl
-
-* Expression template engine: -- DONE
-   -- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>
-
-* CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
-
 ** Make the Tensor types and Complex etc... play more nicely.
   - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
-
   QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken TensorRemove, but I
   want to introduce a syntax that does not require this.
+
   - Reductions that contract indices on a site should always demote the tensor structure.
     norm2(), innerProduct.
+
   - Result of Sum(), SliceSum // spatial sums
               trace, traceIndex etc.. do not.
+
   - problem arises because "trace" returns Lattice<TComplex> moving everything down to Scalar,
     and then Sum and SliceSum to not remove the Scalars. This would be fixed if we 
     template specialize the scalar scalar scalar sum and SliceSum,  on the basis of being
     pure scalar.
 
-* Optimise the extract/merge SIMD routines; Azusa??
- - I have collated into single location at least.
- - Need to use _mm_*insert/extract routines.
-
-* Flavour matrices?
-* Pauli, SU subgroup, etc.. 
-* su3 exponentiation & log etc.. [Jamie's code?]
-* TaProj
-* FFTnD ?
-
-* Parallel io improvements
-  - optional parallel MPI2 IO
-  - move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
-  
-* rb4d support for 5th dimension in Mobius.
+*** Expression template engine: -- DONE
 
+[   -- Norm2(expression) problem: introduce norm2 unary op, or introduce automatic conversion from expression to Lattice<vobj>
+* Strong test for norm2, conj and all primitive types. -- tests/Grid_simd.cc is almost there
+* Implement where within expression template scheme.
 * Check for missing functionality                    - partially audited against QDP++ layout
+
    // Unary functions
    // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
    // exp, log, sqrt, fabs
@@ -69,7 +48,21 @@
    // adjColor, adjSpin,
    // copyMask.
    // localMaxAbs
-   // Fourier transform equivalent.
+   // Fourier transform equivalent.]
+
+================================================================
+*** New Functionality
+================================================================
+
+* - BinaryWriter, TextWriter etc...
+  - use protocol buffers? replace xmlReader/Writer etc..
+  - Binary use htonll, htonl
+
+* CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
+
+* Parallel io improvements
+  - optional parallel MPI2 IO
+  - move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
 
 Actions -- coherent framework for implementing actions and their forces.
 
@@ -80,9 +73,6 @@ Actions -- coherent framework for implementing actions and their forces.
   - Mobius
   - z-Mobius
 
-* Gauge
-  - Wilson, symanzik, iwasaki
-
 Algorithms (lots of reuse/port from BFM)
 * LinearOperator
 * LinearSolver
@@ -97,8 +87,21 @@ Algorithms (lots of reuse/port from BFM)
 * HDCG
 * HMC, 
 * Heatbath
+* Integrators, leapfrog, omelyan, force gradient etc...
 * etc..
 
+* Gauge
+  - Wilson, symanzik, iwasaki
+
+* rb4d support for 5th dimension in Mobius.
+
+* Flavour matrices?
+* Pauli, SU subgroup, etc.. 
+* su3 exponentiation & log etc.. [Jamie's code?]
+* TaProj
+* FFTnD ?
+
+
 ======================================================================================================
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
 ======================================================================================================
diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc
index 9ce13090..7718eb19 100644
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   int ncall=1000;
   double t0=usecond();
   for(int i=0;i<ncall;i++){
-    Dw.multiply(src,result);
+    Dw.M(src,result);
   }
   double t1=usecond();
   double flops=1320*volume*ncall;
diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc
index 25f1d914..14c48cf2 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -85,7 +85,17 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
   }
 }
 
-void WilsonMatrix::multiply(const LatticeFermion &in, LatticeFermion &out)
+void WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
+{
+  Dhop(in,out); // M simply forwards to Dhop; nothing else is applied to `out` here
+  return;
+}
+void WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
+{
+  Dhop(in,out); // NOTE(review): identical call to M(); no adjoint is taken in this body -- presumably a placeholder, confirm
+  return;
+}
+void WilsonMatrix::MdagM(const LatticeFermion &in, LatticeFermion &out)
 {
   Dhop(in,out);
   return;
@@ -96,18 +106,18 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
   WilsonCompressor compressor;
   Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 
-  vHalfSpinColourVector  tmp;    
-  vHalfSpinColourVector  chi;    
-  vSpinColourVector result;
-  vHalfSpinColourVector Uchi;
-  int offset,local,perm, ptype;
-
 PARALLEL_FOR_LOOP
   for(int sss=0;sss<grid->oSites();sss++){
 
+    vHalfSpinColourVector  tmp;    
+    vHalfSpinColourVector  chi;    
+    vSpinColourVector result;
+    vHalfSpinColourVector Uchi;
+    int offset,local,perm, ptype;
+
+    //    int ss = Stencil._LebesgueReorder[sss];
     int ss = sss;
     int ssu= ss;
-    //    int ss = Stencil._LebesgueReorder[sss];
 
     // Xp
     offset = Stencil._offsets [Xp][ss];
diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h
index 900f1801..e19cbabe 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -3,16 +3,12 @@
 
 #include <Grid.h>
 
+#include <algorithms/LinearOperator.h>
+
 namespace Grid {
 
   namespace QCD {
 
-
-    template<class vtype> class LinearOperatorBase {
-    public:
-      void multiply(const Lattice<vtype> &in, Lattice<vtype> &out){ assert(0);}
-    };
-
     class WilsonMatrix : public LinearOperatorBase<LatticeFermion>
     {
       //NB r=1;
@@ -40,7 +36,9 @@ namespace Grid {
       void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
 
       // override multiply
-      void multiply(const LatticeFermion &in, LatticeFermion &out);
+      virtual void M    (const LatticeFermion &in, LatticeFermion &out); // defined in Grid_qcd_wilson_dop.cc as a call to Dhop(in,out)
+      virtual void Mdag (const LatticeFermion &in, LatticeFermion &out); // NOTE(review): definition is the same Dhop(in,out) call as M -- confirm placeholder
+      virtual void MdagM(const LatticeFermion &in, LatticeFermion &out); // NOTE(review): definition is a single Dhop(in,out) call, same as M -- confirm placeholder
 
       // non-hermitian hopping term; half cb or both
       void Dhop(const LatticeFermion &in, LatticeFermion &out);