OMP dslash working

2025-10-29 19:14:33 +00:00 · 2015-05-13 10:59:22 +01:00
parent 457cc0d5a3
commit 7f3ae64a31
4 changed files with 64 additions and 53 deletions
--- a/77
+++ b/77
@@ -8,60 +8,39 @@
 * const audit
 Insert/Extract
 * Replace vset with a call to merge.; 
 * care in Gmerge,Gextract over vset .
 * extract / merge extra implementation removal      
-
+* Optimise the extract/merge SIMD routines; Azusa??
-* Strong test for norm2, conj and all primitive types. -- tests/Grid_simd.cc is almost there
+ - I have collated into single location at least.
 - Need to use _mm_*insert/extract routines.
 * Thread scaling tests Xeon, XeonPhi
 ================================================================
 *** New Functionality
 ================================================================
 * Implement where within expression template scheme.
 * - BinaryWriter, TextWriter etc...
  - use protocol buffers? replace xmlReader/Writer etc..
  - Binary use htonll, htonl
 * Expression template engine: -- DONE
   -- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>
 * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
 ** Make the Tensor types and Complex etc... play more nicely.
  - TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
  QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken by TensorRemove, but I
  want to introduce a syntax that does not require this.
  - Reductions that contract indices on a site should always demote the tensor structure.
    norm2(), innerProduct.
  - Result of Sum(), SliceSum // spatial sums
              trace, traceIndex etc.. do not.
  - problem arises because "trace" returns Lattice<TComplex> moving everything down to Scalar,
    and then Sum and SliceSum do not remove the Scalars. This would be fixed if we 
    template specialize the scalar scalar scalar sum and SliceSum,  on the basis of being
    pure scalar.
-* Optimise the extract/merge SIMD routines; Azusa??
+*** Expression template engine: -- DONE
 - I have collated into single location at least.
 - Need to use _mm_*insert/extract routines.
 * Flavour matrices?
 * Pauli, SU subgroup, etc.. 
 * su3 exponentiation & log etc.. [Jamie's code?]
 * TaProj
 * FFTnD ?
 * Parallel io improvements
  - optional parallel MPI2 IO
  - move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
 * rb4d support for 5th dimension in Mobius.
 [   -- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice<vobj>
 * Strong test for norm2, conj and all primitive types. -- tests/Grid_simd.cc is almost there
 * Implement where within expression template scheme.
 * Check for missing functionality                    - partially audited against QDP++ layout
   // Unary functions
   // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
   // exp, log, sqrt, fabs
@@ -69,7 +48,21 @@
   // adjColor, adjSpin,
   // copyMask.
   // localMaxAbs
-   // Fourier transform equivalent.
+   // Fourier transform equivalent.]
 ================================================================
 *** New Functionality
 ================================================================
 * - BinaryWriter, TextWriter etc...
  - use protocol buffers? replace xmlReader/Writer etc..
  - Binary use htonll, htonl
 * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
 * Parallel io improvements
  - optional parallel MPI2 IO
  - move Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test.
 Actions -- coherent framework for implementing actions and their forces.
@@ -80,9 +73,6 @@ Actions -- coherent framework for implementing actions and their forces.
  - Mobius
  - z-Mobius
 * Gauge
  - Wilson, symanzik, iwasaki
 Algorithms (lots of reuse/port from BFM)
 * LinearOperator
 * LinearSolver
@@ -97,8 +87,21 @@ Algorithms (lots of reuse/port from BFM)
 * HDCG
 * HMC, 
 * Heatbath
 * Integrators, leapfrog, omelyan, force gradient etc...
 * etc..
 * Gauge
  - Wilson, symanzik, iwasaki
 * rb4d support for 5th dimension in Mobius.
 * Flavour matrices?
 * Pauli, SU subgroup, etc.. 
 * su3 exponentiation & log etc.. [Jamie's code?]
 * TaProj
 * FFTnD ?
 ======================================================================================================
 FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
 ======================================================================================================
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
  int ncall=1000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
-    Dw.multiply(src,result);
+    Dw.M(src,result);
  }
  double t1=usecond();
  double flops=1320*volume*ncall;
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -85,7 +85,17 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
  }
 }
-void WilsonMatrix::multiply(const LatticeFermion &in, LatticeFermion &out)
+void WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 {
  // Forwards both arguments unchanged to Dhop; no other term is applied in this function.
  Dhop(in,out);
  return;
 }
 // Mdag: as written, simply forwards (in,out) to Dhop, the same call M makes.
 // NOTE(review): no adjoint/conjugation is applied here — presumably a placeholder; confirm.
 void WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
  Dhop(in,out);
  return;
 }
 void WilsonMatrix::MdagM(const LatticeFermion &in, LatticeFermion &out)
 {
  Dhop(in,out);
  return;
@@ -96,18 +106,18 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
  WilsonCompressor compressor;
  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
  vHalfSpinColourVector  tmp;    
  vHalfSpinColourVector  chi;    
  vSpinColourVector result;
  vHalfSpinColourVector Uchi;
  int offset,local,perm, ptype;
 PARALLEL_FOR_LOOP
  for(int sss=0;sss<grid->oSites();sss++){
    vHalfSpinColourVector  tmp;    
    vHalfSpinColourVector  chi;    
    vSpinColourVector result;
    vHalfSpinColourVector Uchi;
    int offset,local,perm, ptype;
    //    int ss = Stencil._LebesgueReorder[sss];
    int ss = sss;
    int ssu= ss;
    //    int ss = Stencil._LebesgueReorder[sss];
    // Xp
    offset = Stencil._offsets [Xp][ss];
--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -3,16 +3,12 @@
 #include <Grid.h>
 #include <algorithms/LinearOperator.h>
 namespace Grid {
  namespace QCD {
     // Minimal operator base class templated on the lattice element type.
     // multiply() is a stub: it does no work and trips assert(0) if called
     // (the assert is compiled out when NDEBUG is defined).
     // It is not declared virtual, so a same-named member in a derived class
     // hides it rather than overriding it.
     template<class vtype> class LinearOperatorBase {
     public:
       void multiply(const Lattice<vtype> &in, Lattice<vtype> &out){ assert(0);}
     };
    class WilsonMatrix : public LinearOperatorBase<LatticeFermion>
    {
      //NB r=1;
@@ -40,7 +36,9 @@ namespace Grid {
      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
      // matrix operations: M, Mdag, MdagM
-      void multiply(const LatticeFermion &in, LatticeFermion &out);
+      virtual void M    (const LatticeFermion &in, LatticeFermion &out);
      virtual void Mdag (const LatticeFermion &in, LatticeFermion &out);
      virtual void MdagM(const LatticeFermion &in, LatticeFermion &out);
      // non-hermitian hopping term; half cb or both
      void Dhop(const LatticeFermion &in, LatticeFermion &out);