From b812d5e39c57d14f3cc4c769ed1f72cf90691cb4 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Tue, 6 Dec 2016 16:31:13 +0000
Subject: [PATCH] Added single threaded version of the derivative for the Ls
 vectorised DWF

---
 lib/cartesian/Cartesian_base.h                |  48 +++--
 lib/lattice/Lattice_rng.h                     |  75 +++++---
 lib/qcd/action/fermion/FermionOperatorImpl.h  | 168 +++++++++++-------
 lib/qcd/action/fermion/WilsonFermion5D.cc     |  31 ++--
 .../EvenOddSchurDifferentiable.h              |  22 ++-
 .../pseudofermion/TwoFlavourEvenOddRatio.h    |   4 +-
 lib/qcd/hmc/integrators/Integrator.h          |   1 +
 lib/simd/Grid_vector_types.h                  |  13 ++
 lib/tensors/Tensor_class.h                    |  48 +++++
 tests/hmc/Test_hmc_EODWFRatio_Binary.cc       |  30 +++-
 .../hmc/Test_hmc_WilsonFermionGauge_Binary.cc |   2 +
 11 files changed, 307 insertions(+), 135 deletions(-)
diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h
index 72b21ee3..334914e9 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -99,7 +99,7 @@ public:
     virtual int oIndex(std::vector<int> &coor)
     {
         int idx=0;
-	// Works with either global or local coordinates
+        // Works with either global or local coordinates
         for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
         return idx;
     }
@@ -146,15 +146,15 @@ public:
       // Distance should be either 0,1,2..
       //
       if ( _simd_layout[dimension] > 2 ) { 
-	for(int d=0;d<_ndimension;d++){
-	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
-	}
-	permute_type = RotateBit; // How to specify distance; this is not just direction.
-	return permute_type;
+        for(int d=0;d<_ndimension;d++){
+          if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
+        }
+        permute_type = RotateBit; // How to specify distance; this is not just direction.
+        return permute_type;
       }
 
       for(int d=_ndimension-1;d>dimension;d--){
-	if (_simd_layout[d]>1 ) permute_type++;
+        if (_simd_layout[d]>1 ) permute_type++;
       }
       return permute_type;
     }
@@ -174,6 +174,22 @@ public:
     inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
     inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
 
+    ////////////////////////////////////////////////////////////////
+    // Print decomposition
+    ////////////////////////////////////////////////////////////////
+
+   	void show_decomposition(){
+   		std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
+   		std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
+   	        std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
+                std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
+   		std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
+   		std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
+   		std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;	
+   		std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
+   		std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;		
+   	} 
+
     ////////////////////////////////////////////////////////////////
     // Global addressing
     ////////////////////////////////////////////////////////////////
@@ -187,8 +203,8 @@ public:
       gidx=0;
       int mult=1;
       for(int mu=0;mu<_ndimension;mu++) {
-	gidx+=mult*gcoor[mu];
-	mult*=_gdimensions[mu];
+        gidx+=mult*gcoor[mu];
+        mult*=_gdimensions[mu];
       }
     }
     void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
@@ -196,9 +212,9 @@ public:
       pcoor.resize(_ndimension);
       lcoor.resize(_ndimension);
       for(int mu=0;mu<_ndimension;mu++){
-	int _fld  = _fdimensions[mu]/_processors[mu];
-	pcoor[mu] = gcoor[mu]/_fld;
-	lcoor[mu] = gcoor[mu]%_fld;
+        int _fld  = _fdimensions[mu]/_processors[mu];
+        pcoor[mu] = gcoor[mu]/_fld;
+        lcoor[mu] = gcoor[mu]%_fld;
       }
     }
     void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@@ -210,9 +226,9 @@ public:
 
       std::vector<int> cblcoor(lcoor);
       for(int d=0;d<cblcoor.size();d++){
-	if( this->CheckerBoarded(d) ) {
-	  cblcoor[d] = lcoor[d]/2;
-	}
+        if( this->CheckerBoarded(d) ) {
+          cblcoor[d] = lcoor[d]/2;
+        }
       }
 
       i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
@@ -238,7 +254,7 @@ public:
     {
       RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
       if(CheckerBoarded(0)){
-	fcoor[0] = fcoor[0]*2+cb;
+        fcoor[0] = fcoor[0]*2+cb;
       }
     }
     void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
diff --git a/lib/lattice/Lattice_rng.h b/lib/lattice/Lattice_rng.h
index cd73b462..c5607ed8 100644
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -6,8 +6,8 @@
 
     Copyright (C) 2015
 
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -67,24 +67,42 @@ namespace Grid {
     return multiplicity;
   }
 
+
+
+    inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
+  {
+    int rngdims = coarse->_ndimension;
+
+    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+    int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
+    // assumes that the higher dimensions are not using more processors
+    // all further divisions are local
+    for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
+    for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+    
+
+    // then divide the number of local sites
+    // check that the total number of sims agree, meanse the iSites are the same
+    assert(fine->Nsimd() == coarse->Nsimd());
+
+    // check that the two grids divide cleanly
+    assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+
+    return fine->lSites() / coarse->lSites();
+  }
+
   // Wrap seed_seq to give common interface with random_device
   class fixedSeed {
   public:
-
     typedef std::seed_seq::result_type result_type;
-
     std::seed_seq src;
     
     fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
 
     result_type operator () (void){
-
       std::vector<result_type> list(1);
-
       src.generate(list.begin(),list.end());
-
       return list[0];
-
     }
 
   };
@@ -252,24 +270,30 @@ namespace Grid {
   };
 
   class GridParallelRNG : public GridRNGbase {
+
+    double _time_counter;
+
   public:
 
     GridBase *_grid;
-    int _vol;
+    unsigned int _vol;
 
     int generator_idx(int os,int is){
       return is*_grid->oSites()+os;
     }
 
     GridParallelRNG(GridBase *grid) : GridRNGbase() {
-      _grid=grid;
-      _vol =_grid->iSites()*_grid->oSites();
+      _grid = grid;
+      _vol  =_grid->iSites()*_grid->oSites();
 
       _generators.resize(_vol);
       _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
       _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
       _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
-      _seeded=0;
+      _seeded = 0;
+
+      _time_counter = 0.0;
+
     }
 
 
@@ -325,37 +349,36 @@ namespace Grid {
       typedef typename vobj::scalar_type scalar_type;
       typedef typename vobj::vector_type vector_type;
 
-      int multiplicity = RNGfillable(_grid, l._grid);
+      double inner_time_counter = usecond();
 
-      int Nsimd = _grid->Nsimd();
-      int osites = _grid->oSites();
+      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
+
+      int Nsimd = _grid->Nsimd();// guaranteed to be the same for l._grid too
+      int osites = _grid->oSites();// guaranteed to be <= l._grid->oSites() by a factor multiplicity
       int words = sizeof(scalar_object) / sizeof(scalar_type);
 
       PARALLEL_FOR_LOOP
       for (int ss = 0; ss < osites; ss++) {
 
         std::vector<scalar_object> buf(Nsimd);
-        for (int m = 0; m < multiplicity;
-             m++) {  // Draw from same generator multiplicity times
+        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
 
-          int sm = multiplicity * ss +
-                   m;  // Maps the generator site to the fine site
+          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
 
           for (int si = 0; si < Nsimd; si++) {
             
             int gdx = generator_idx(ss, si);  // index of generator state
             scalar_type *pointer = (scalar_type *)&buf[si];
             dist[gdx].reset();
-            for (int idx = 0; idx < words; idx++) {
-              
+            for (int idx = 0; idx < words; idx++) 
               fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
-            }
           }
-
           // merge into SIMD lanes
           merge(l._odata[sm], buf);
         }
       }
+
+      _time_counter += usecond()- inner_time_counter;
     };
 
     void SeedRandomDevice(void) {
@@ -366,6 +389,12 @@ namespace Grid {
       fixedSeed src(seeds);
       Seed(src);
     }
+
+    void Report(){
+      std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
+    }
+
+
   };
 
   template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 0800dea6..be4f629f 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -44,14 +44,14 @@ namespace QCD {
   // Ultimately need Impl to always define types where XXX is opaque
   //
   //    typedef typename XXX               Simd;
-  //    typedef typename XXX     GaugeLinkField;	
+  //    typedef typename XXX     GaugeLinkField;        
   //    typedef typename XXX         GaugeField;
   //    typedef typename XXX      GaugeActField;
   //    typedef typename XXX       FermionField;
   //    typedef typename XXX  DoubledGaugeField;
   //    typedef typename XXX         SiteSpinor;
-  //    typedef typename XXX     SiteHalfSpinor;	
-  //    typedef typename XXX         Compressor;	
+  //    typedef typename XXX     SiteHalfSpinor;        
+  //    typedef typename XXX         Compressor;        
   //
   // and Methods:
   //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
@@ -94,17 +94,17 @@ namespace QCD {
   ////////////////////////////////////////////////////////////////////////
   
 #define INHERIT_FIMPL_TYPES(Impl)\
-  typedef typename Impl::FermionField           FermionField;		\
-  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
-  typedef typename Impl::SiteSpinor               SiteSpinor;		\
-  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
-  typedef typename Impl::Compressor               Compressor;		\
-  typedef typename Impl::StencilImpl             StencilImpl;		\
-  typedef typename Impl::ImplParams ImplParams;				\
+  typedef typename Impl::FermionField           FermionField;           \
+  typedef typename Impl::DoubledGaugeField DoubledGaugeField;           \
+  typedef typename Impl::SiteSpinor               SiteSpinor;           \
+  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;           \
+  typedef typename Impl::Compressor               Compressor;           \
+  typedef typename Impl::StencilImpl             StencilImpl;           \
+  typedef typename Impl::ImplParams ImplParams;                         \
   typedef typename Impl::Coeff_t       Coeff_t;
   
 #define INHERIT_IMPL_TYPES(Base) \
-  INHERIT_GIMPL_TYPES(Base)	 \
+  INHERIT_GIMPL_TYPES(Base)      \
   INHERIT_FIMPL_TYPES(Base)
   
   /////////////////////////////////////////////////////////////////////////////
@@ -148,11 +148,11 @@ namespace QCD {
     bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
       
     inline void multLink(SiteHalfSpinor &phi,
-			 const SiteDoubledGaugeField &U,
-			 const SiteHalfSpinor &chi,
-			 int mu,
-			 StencilEntry *SE,
-			 StencilImpl &St) {
+                         const SiteDoubledGaugeField &U,
+                         const SiteHalfSpinor &chi,
+                         int mu,
+                         StencilEntry *SE,
+                         StencilImpl &St) {
       mult(&phi(), &U(mu), &chi());
     }
       
@@ -162,16 +162,16 @@ namespace QCD {
     }
       
     inline void DoubleStore(GridBase *GaugeGrid,
-			    DoubledGaugeField &Uds,
-			    const GaugeField &Umu) {
+                            DoubledGaugeField &Uds,
+                            const GaugeField &Umu) {
       conformable(Uds._grid, GaugeGrid);
       conformable(Umu._grid, GaugeGrid);
       GaugeLinkField U(GaugeGrid);
       for (int mu = 0; mu < Nd; mu++) {
-	U = PeekIndex<LorentzIndex>(Umu, mu);
-	PokeIndex<LorentzIndex>(Uds, U, mu);
-	U = adj(Cshift(U, mu, -1));
-	PokeIndex<LorentzIndex>(Uds, U, mu + 4);
+        U = PeekIndex<LorentzIndex>(Umu, mu);
+        PokeIndex<LorentzIndex>(Uds, U, mu);
+        U = adj(Cshift(U, mu, -1));
+        PokeIndex<LorentzIndex>(Uds, U, mu + 4);
       }
     }
 
@@ -189,11 +189,11 @@ namespace QCD {
       
       PARALLEL_FOR_LOOP
       for(int sss=0;sss<tmp._grid->oSites();sss++){
-	int sU=sss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
-	}
+        int sU=sss;
+        for(int s=0;s<Ls;s++){
+          int sF = s+Ls*sU;
+          tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
+        }
       }
       PokeIndex<LorentzIndex>(mat,tmp,mu);
       
@@ -248,12 +248,12 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
   }
 
   inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
-		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
-		       StencilImpl &St) {
+                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+                       StencilImpl &St) {
     SiteGaugeLink UU;
     for (int i = 0; i < Nrepresentation; i++) {
       for (int j = 0; j < Nrepresentation; j++) {
-	vsplat(UU()()(i, j), U(mu)()(i, j));
+        vsplat(UU()()(i, j), U(mu)()(i, j));
       }
     }
     mult(&phi(), &UU(), &chi());
@@ -290,10 +290,40 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
   {
     assert(0);
   }
-      
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu) 
-  {
-	assert(0);
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+    int LLs = Btilde._grid->_rdimensions[0];
+    conformable(Atilde._grid,Btilde._grid);
+    GaugeLinkField tmp(mat._grid); 
+    tmp = zero;
+    typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
+    std::vector<typename result_type::scalar_object> v_scalar_object(Btilde._grid->Nsimd());
+
+    PARALLEL_FOR_LOOP
+    for (int sss = 0; sss < tmp._grid->oSites(); sss++) {
+      std::vector<int> ocoor;
+      tmp._grid->oCoorFromOindex(ocoor,sss); 
+      for (int si = 0; si < tmp._grid->iSites(); si++){
+        typename result_type::scalar_object scalar_object;
+        scalar_object = zero;
+        std::vector<int> local_coor(tmp._grid->Nd());      
+        std::vector<int> icoor;  
+        tmp._grid->iCoorFromIindex(icoor,si);
+        for (int i = 0; i < tmp._grid->Nd(); i++) local_coor[i] = ocoor[i] + tmp._grid->_rdimensions[i]*icoor[i];
+        
+        for (int s = 0; s < LLs; s++) {
+          std::vector<int> slocal_coor(Btilde._grid->Nd());
+          slocal_coor[0] = s;
+          for (int s4d = 1; s4d< Btilde._grid->Nd(); s4d++) slocal_coor[s4d] = local_coor[s4d-1];
+          int sF = Btilde._grid->oIndexReduced(slocal_coor);  
+          assert(sF < Btilde._grid->oSites());
+          extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), v_scalar_object); 
+          for (int sv = 0; sv < v_scalar_object.size(); sv++) scalar_object += v_scalar_object[sv];  // sum across the 5d dimension
+        }
+        tmp._odata[sss].putlane(scalar_object, si);
+      }
+    }
+    PokeIndex<LorentzIndex>(mat, tmp, mu);
   }
 };
     
@@ -339,19 +369,19 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
  // provide the multiply by link that is differentiated between Gparity (with
  // flavour index) and non-Gparity
  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
-		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
-		      StencilImpl &St) {
+                      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+                      StencilImpl &St) {
 
   typedef SiteHalfSpinor vobj;
    typedef typename SiteHalfSpinor::scalar_object sobj;
-	
+        
    vobj vtmp;
    sobj stmp;
-	
+        
    GridBase *grid = St._grid;
-	
+        
    const int Nsimd = grid->Nsimd();
-	
+        
    int direction = St._directions[mu];
    int distance = St._distances[mu];
    int ptype = St._permute_type[mu];
@@ -359,13 +389,13 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
    
    // Fixme X.Y.Z.T hardcode in stencil
    int mmu = mu % Nd;
-	
+        
    // assert our assumptions
    assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
    assert((sl == 1) || (sl == 2));
    
    std::vector<int> icoor;
-	
+        
    if ( SE->_around_the_world && Params.twists[mmu] ) {
 
      if ( sl == 2 ) {
@@ -375,25 +405,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
        extract(chi,vals);
        for(int s=0;s<Nsimd;s++){
 
-	 grid->iCoorFromIindex(icoor,s);
-	      
-	 assert((icoor[direction]==0)||(icoor[direction]==1));
-	      
-	 int permute_lane;
-	 if ( distance == 1) {
-	   permute_lane = icoor[direction]?1:0;
-	 } else {
-	   permute_lane = icoor[direction]?0:1;
-	 }
-	      
-	 if ( permute_lane ) { 
-	   stmp(0) = vals[s](1);
-	   stmp(1) = vals[s](0);
-	   vals[s] = stmp;
-	      }
+         grid->iCoorFromIindex(icoor,s);
+              
+         assert((icoor[direction]==0)||(icoor[direction]==1));
+              
+         int permute_lane;
+         if ( distance == 1) {
+           permute_lane = icoor[direction]?1:0;
+         } else {
+           permute_lane = icoor[direction]?0:1;
+         }
+              
+         if ( permute_lane ) { 
+           stmp(0) = vals[s](1);
+           stmp(1) = vals[s](0);
+           vals[s] = stmp;
+              }
        }
        merge(vtmp,vals);
-	    
+            
      } else { 
        vtmp(0) = chi(1);
        vtmp(1) = chi(0);
@@ -418,11 +448,11 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
    GaugeLinkField Uconj(GaugeGrid);
    
    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-	
+        
    for(int mu=0;mu<Nd;mu++){
-	  
+          
      LatticeCoordinate(coor,mu);
-	  
+          
      U     = PeekIndex<LorentzIndex>(Umu,mu);
      Uconj = conjugate(U);
      
@@ -431,13 +461,13 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
      if ( Params.twists[mu] ) { 
        Uconj = where(coor==neglink,-Uconj,Uconj);
      }
-	  
+          
 PARALLEL_FOR_LOOP
      for(auto ss=U.begin();ss<U.end();ss++){
        Uds[ss](0)(mu) = U[ss]();
        Uds[ss](1)(mu) = Uconj[ss]();
      }
-	  
+          
      U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
      Uconj = adj(Cshift(Uconj,mu,-1));
  
@@ -445,22 +475,22 @@ PARALLEL_FOR_LOOP
      if ( Params.twists[mu] ) { 
        Utmp = where(coor==0,Uconj,Utmp);
      }
-	  
+          
 PARALLEL_FOR_LOOP
      for(auto ss=U.begin();ss<U.end();ss++){
        Uds[ss](0)(mu+4) = Utmp[ss]();
      }
-	  
+          
      Utmp = Uconj;
      if ( Params.twists[mu] ) { 
        Utmp = where(coor==0,U,Utmp);
      }
-	  
+          
 PARALLEL_FOR_LOOP
      for(auto ss=U.begin();ss<U.end();ss++){
        Uds[ss](1)(mu+4) = Utmp[ss]();
      }
-	  
+          
    }
  }
       
@@ -482,7 +512,7 @@ PARALLEL_FOR_LOOP
  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
 
    int Ls = Btilde._grid->_fdimensions[0];
-	
+        
    GaugeLinkField tmp(mat._grid);
    tmp = zero;
 PARALLEL_FOR_LOOP
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index d2ac96e3..8b8d1e2d 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -271,11 +271,14 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
   assert(dirdisp<=7);
   assert(dirdisp>=0);
 
+  int LLs = out._grid->_rdimensions[0];
+
 PARALLEL_FOR_LOOP
   for(int ss=0;ss<Umu._grid->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      int sU=ss;
-      int sF = s+Ls*sU; 
+    int sU=ss;
+    for(int s=0;s<LLs;s++){
+      int sF = s+LLs*sU; 
+      assert(sF < out._grid->oSites());
       Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
     }
   }
@@ -305,6 +308,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
   DerivCommTime+=usecond();
 
   Atilde=A;
+  int LLs = B._grid->_rdimensions[0];
+
 
   DerivComputeTime-=usecond();
   for (int mu = 0; mu < Nd; mu++) {
@@ -321,20 +326,18 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     DerivDhopComputeTime -= usecond();
     PARALLEL_FOR_LOOP
     for (int sss = 0; sss < U._grid->oSites(); sss++) {
-      for (int s = 0; s < Ls; s++) {
-        int sU = sss;
-        int sF = s + Ls * sU;
-
+      int sU = sss;
+      for (int s = 0; s < LLs; s++) {
+        int sF = s + LLs * sU;
         assert(sF < B._grid->oSites());
         assert(sU < U._grid->oSites());
 
         Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
-
-        ////////////////////////////
-        // spin trace outer product
-        ////////////////////////////
       }
     }
+    ////////////////////////////
+    // spin trace outer product
+    ////////////////////////////
     DerivDhopComputeTime += usecond();
     Impl::InsertForce5D(mat, Btilde, Atilde, mu);
   }
@@ -349,7 +352,7 @@ void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
 {
   conformable(A._grid,FermionGrid());  
   conformable(A._grid,B._grid);
-  conformable(GaugeGrid(),mat._grid);
+  //conformable(GaugeGrid(),mat._grid);
 
   mat.checkerboard = A.checkerboard;
 
@@ -363,7 +366,7 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 					int dag)
 {
   conformable(A._grid,FermionRedBlackGrid());
-  conformable(GaugeRedBlackGrid(),mat._grid);
+  //conformable(GaugeRedBlackGrid(),mat._grid);
   conformable(A._grid,B._grid);
 
   assert(B.checkerboard==Odd);
@@ -381,7 +384,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 					int dag)
 {
   conformable(A._grid,FermionRedBlackGrid());
-  conformable(GaugeRedBlackGrid(),mat._grid);
+  //conformable(GaugeRedBlackGrid(),mat._grid);
   conformable(A._grid,B._grid);
 
   assert(B.checkerboard==Even);
diff --git a/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h b/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
index 6837bb19..6965fea9 100644
--- a/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@@ -68,8 +68,14 @@ namespace Grid{
 	  assert(U.checkerboard==Odd);
 	  assert(V.checkerboard==U.checkerboard);
 
-	  GaugeField ForceO(ucbgrid);
-	  GaugeField ForceE(ucbgrid);
+          // NOTE Guido: WE DO NOT WANT TO USE THIS GRID FOR THE FORCE
+          // INHERIT FROM THE Force field
+          //GaugeField ForceO(ucbgrid);
+          //GaugeField ForceE(ucbgrid);
+          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
+          GaugeField ForceO(forcecb);
+          GaugeField ForceE(forcecb);
+
 
 	  //  X^dag Der_oe MeeInv Meo Y
 	  // Use Mooee as nontrivial but gauge field indept
@@ -110,8 +116,14 @@ namespace Grid{
 	  assert(V.checkerboard==Odd);
 	  assert(V.checkerboard==V.checkerboard);
 
-	  GaugeField ForceO(ucbgrid);
-	  GaugeField ForceE(ucbgrid);
+          // NOTE Guido: WE DO NOT WANT TO USE THIS GRID FOR THE FORCE
+          // INHERIT FROM THE Force field
+
+	  //GaugeField ForceO(ucbgrid);
+	  //GaugeField ForceE(ucbgrid);
+          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
+          GaugeField ForceO(forcecb);
+          GaugeField ForceE(forcecb);
 
 	  //  X^dag Der_oe MeeInv Meo Y
 	  // Use Mooee as nontrivial but gauge field indept
@@ -130,6 +142,8 @@ namespace Grid{
 	  setCheckerboard(Force,ForceE); 
 	  setCheckerboard(Force,ForceO);
 	  Force=-Force;
+
+
 	}
 
     };
diff --git a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
index a8bd9fe7..d7a26695 100644
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -166,7 +166,9 @@ namespace Grid{
 	FermionField  X(NumOp.FermionRedBlackGrid());
 	FermionField  Y(NumOp.FermionRedBlackGrid());
 
-	GaugeField   force(NumOp.GaugeGrid());	
+	//GaugeField   force(NumOp.GaugeGrid());
+        GaugeField   force(dSdU._grid);
+        conformable(force._grid, dSdU._grid);
 
 	//Y=Vdag phi
 	//X = (Mdag M)^-1 V^dag phi
diff --git a/lib/qcd/hmc/integrators/Integrator.h b/lib/qcd/hmc/integrators/Integrator.h
index 6f1e5c45..bc076085 100644
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -114,6 +114,7 @@ class Integrator {
     // Fundamental updates, include smearing
     for (int a = 0; a < as[level].actions.size(); ++a) {
       Field force(U._grid);
+      conformable(U._grid, Mom._grid);
       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
 
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 42f28b34..fc217cc5 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -386,6 +386,19 @@ class Grid_simd {
     }
   }
 
+  ///////////////////////////////
+  // Getting single lanes
+  ///////////////////////////////
+  inline Scalar_type getlane(int lane) {
+    return ((Scalar_type*)&v)[lane];
+  }
+
+  inline void putlane(const Scalar_type &S, int lane){
+    ((Scalar_type*)&v)[lane] = S;
+  }
+
+
+  
 };  // end of Grid_simd class definition
 
 
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index 916fc09c..cdd57792 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -88,6 +88,21 @@ class iScalar {
     zeroit(*this);
     return *this;
   }
+
+
+  // managing the internal vector structure
+  strong_inline scalar_object getlane(int lane){
+    scalar_object ret;
+    ret._internal = _internal.getlane(lane);
+    return ret;
+  }
+
+  strong_inline void putlane(scalar_object &s, int lane){
+    _internal.putlane(s._internal,lane);
+  }
+
+
+  
   friend strong_inline void vstream(iScalar<vtype> &out,
                                     const iScalar<vtype> &in) {
     vstream(out._internal, in._internal);
@@ -226,6 +241,20 @@ class iVector {
     zeroit(*this);
     return *this;
   }
+
+  strong_inline scalar_object getlane(int lane){
+    scalar_object ret;
+    for (int i = 0; i < N; i++) ret._internal[i] = _internal[i].getlane(lane);
+    return ret;
+  }
+
+  strong_inline void putlane(scalar_object &s, int lane){
+    for (int i = 0; i < N; i++) _internal[i].putlane(s._internal[i],lane);
+  }
+  
+
+
+  
   friend strong_inline void zeroit(iVector<vtype, N> &that) {
     for (int i = 0; i < N; i++) {
       zeroit(that._internal[i]);
@@ -349,6 +378,25 @@ class iMatrix {
     return *this;
   }
 
+
+  strong_inline scalar_object getlane(int lane){
+    scalar_object ret;
+    for (int i = 0; i < N; i++) {
+      for (int j = 0; j < N; j++) {
+	ret._internal[i][j] = _internal[i][j].getlane(lane);
+      }
+    }
+    return ret;
+  }
+
+  strong_inline void putlane(scalar_object &s, int lane){
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) _internal[i][j].putlane(s._internal[i][j],lane);
+  }
+  
+
+  
+
   friend strong_inline void zeroit(iMatrix<vtype,N> &that){
     for(int i=0;i<N;i++){
       for(int j=0;j<N;j++){
diff --git a/tests/hmc/Test_hmc_EODWFRatio_Binary.cc b/tests/hmc/Test_hmc_EODWFRatio_Binary.cc
index 44d4f44b..510de3ed 100644
--- a/tests/hmc/Test_hmc_EODWFRatio_Binary.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio_Binary.cc
@@ -62,31 +62,45 @@ public:
   void BuildTheAction(int argc, char **argv)
 
   {
-    typedef WilsonImplR ImplPolicy;
-    typedef ScaledShamirFermionR FermionAction;
+    //typedef WilsonImplR ImplPolicy;
+    typedef DomainWallVec5dImplR ImplPolicy;
+    typedef ScaledShamirFermion<ImplPolicy> FermionAction;
     typedef typename FermionAction::FermionField FermionField;
 
-    const int Ls = 12;
+    const int Ls = 8;
 
     UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 
+
+    GridCartesian* sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+    GridRedBlackCartesian* sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+
+    
+    FGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+    FrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+    
+    /*
     FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
     FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+    */
 
     // temporarily need a gauge field
     LatticeGaugeField  U(UGrid);
 
     // Gauge action
     double beta = 4.0;
-    IwasakiGaugeActionR Iaction(beta);
+    WilsonGaugeActionR Iaction(beta);
 
     Real mass = 0.04;
     Real pv   = 1.0;
     RealD M5  = 1.5;
     RealD scale = 2.0;
-    FermionAction DenOp(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,scale);
-    FermionAction NumOp(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pv,M5,scale);
+    FermionAction DenOp(U,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,mass,M5,scale);
+    FermionAction NumOp(U,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,pv,M5,scale);
+
+    std::cout << GridLogMessage << "Frb Osites: " << FrbGrid->oSites() << std::endl;
+    std::cout << GridLogMessage << "sUGrid Osites: " << sUGrid->oSites() << std::endl;
 
     double StoppingCondition = 1.0e-8;
     double MaxCGIterations = 10000;
@@ -94,7 +108,7 @@ public:
     TwoFlavourEvenOddRatioPseudoFermionAction<ImplPolicy> Nf2(NumOp, DenOp,CG,CG);
 
     // Set smearing (true/false), default: false
-    Nf2.is_smeared = true;
+    Nf2.is_smeared = false;
 
     // Collect actions
     // here an example of 2 level integration
@@ -154,7 +168,7 @@ int main(int argc, char **argv) {
 
   // Seeds for the random number generators
   std::vector<int> SerSeed({1, 2, 3, 4, 5});
-  std::vector<int> ParSeed({6, 7, 8, 9, 10});
+  std::vector<int> ParSeed({6, 7, 8, 9, 5});
   TheHMC.RNGSeeds(SerSeed, ParSeed);
 
   TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge_Binary.cc b/tests/hmc/Test_hmc_WilsonFermionGauge_Binary.cc
index 42a903d6..cae53960 100644
--- a/tests/hmc/Test_hmc_WilsonFermionGauge_Binary.cc
+++ b/tests/hmc/Test_hmc_WilsonFermionGauge_Binary.cc
@@ -156,4 +156,6 @@ int main(int argc, char **argv) {
   TheHMC.MDparameters.set(10, 1.0);// MDsteps, traj length
 
   TheHMC.BuildTheAction(argc, argv);
+
+  Grid_finalize();
 }