Added single threaded version of the derivative for the Ls vectorised DWF

2025-11-21 06:59:32 +00:00 · 2016-12-06 16:31:13 +00:00
parent 01480da0a8
commit b812d5e39c
11 changed files with 307 additions and 135 deletions
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -99,7 +99,7 @@ public:
    virtual int oIndex(std::vector<int> &coor)
    {
        int idx=0;
-	// Works with either global or local coordinates
+        // Works with either global or local coordinates
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
@@ -146,15 +146,15 @@ public:
      // Distance should be either 0,1,2..
      //
      if ( _simd_layout[dimension] > 2 ) { 
-	for(int d=0;d<_ndimension;d++){
-	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
-	}
-	permute_type = RotateBit; // How to specify distance; this is not just direction.
-	return permute_type;
+        for(int d=0;d<_ndimension;d++){
+          if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
+        }
+        permute_type = RotateBit; // How to specify distance; this is not just direction.
+        return permute_type;
      }

      for(int d=_ndimension-1;d>dimension;d--){
-	if (_simd_layout[d]>1 ) permute_type++;
+        if (_simd_layout[d]>1 ) permute_type++;
      }
      return permute_type;
    }
@@ -174,6 +174,22 @@ public:
    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};

+    ////////////////////////////////////////////////////////////////
+    // Print decomposition
+    ////////////////////////////////////////////////////////////////
+
+   	void show_decomposition(){
+   		std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
+   		std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
+   	        std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
+                std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
+   		std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
+   		std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
+   		std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;	
+   		std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
+   		std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;		
+   	} 
+
    ////////////////////////////////////////////////////////////////
    // Global addressing
    ////////////////////////////////////////////////////////////////
@@ -187,8 +203,8 @@ public:
      gidx=0;
      int mult=1;
      for(int mu=0;mu<_ndimension;mu++) {
-	gidx+=mult*gcoor[mu];
-	mult*=_gdimensions[mu];
+        gidx+=mult*gcoor[mu];
+        mult*=_gdimensions[mu];
      }
    }
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
@@ -196,9 +212,9 @@ public:
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
-	int _fld  = _fdimensions[mu]/_processors[mu];
-	pcoor[mu] = gcoor[mu]/_fld;
-	lcoor[mu] = gcoor[mu]%_fld;
+        int _fld  = _fdimensions[mu]/_processors[mu];
+        pcoor[mu] = gcoor[mu]/_fld;
+        lcoor[mu] = gcoor[mu]%_fld;
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@@ -210,9 +226,9 @@ public:

      std::vector<int> cblcoor(lcoor);
      for(int d=0;d<cblcoor.size();d++){
-	if( this->CheckerBoarded(d) ) {
-	  cblcoor[d] = lcoor[d]/2;
-	}
+        if( this->CheckerBoarded(d) ) {
+          cblcoor[d] = lcoor[d]/2;
+        }
      }

      i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
@@ -238,7 +254,7 @@ public:
    {
      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
      if(CheckerBoarded(0)){
-	fcoor[0] = fcoor[0]*2+cb;
+        fcoor[0] = fcoor[0]*2+cb;
      }
    }
    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
--- a/lib/lattice/Lattice_rng.h
+++ b/lib/lattice/Lattice_rng.h
@@ -6,8 +6,8 @@

    Copyright (C) 2015

-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -67,24 +67,42 @@ namespace Grid {
    return multiplicity;
  }

+
+
+    inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
+  {
+    int rngdims = coarse->_ndimension;
+
+    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+    int lowerdims   = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
+    // assumes that the higher dimensions are not using more processors
+    // all further divisions are local
+    for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
+    for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+    
+
+    // then divide the number of local sites
+    // check that the total number of sims agree, meanse the iSites are the same
+    assert(fine->Nsimd() == coarse->Nsimd());
+
+    // check that the two grids divide cleanly
+    assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+
+    return fine->lSites() / coarse->lSites();
+  }
+
  // Wrap seed_seq to give common interface with random_device
  class fixedSeed {
  public:
-
    typedef std::seed_seq::result_type result_type;
-
    std::seed_seq src;
    
    fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};

    result_type operator () (void){
-
      std::vector<result_type> list(1);
-
      src.generate(list.begin(),list.end());
-
      return list[0];
-
    }

  };
@@ -252,24 +270,30 @@ namespace Grid {
  };

  class GridParallelRNG : public GridRNGbase {
+
+    double _time_counter;
+
  public:

    GridBase *_grid;
-    int _vol;
+    unsigned int _vol;

    int generator_idx(int os,int is){
      return is*_grid->oSites()+os;
    }

    GridParallelRNG(GridBase *grid) : GridRNGbase() {
-      _grid=grid;
-      _vol =_grid->iSites()*_grid->oSites();
+      _grid = grid;
+      _vol  =_grid->iSites()*_grid->oSites();

      _generators.resize(_vol);
      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
-      _seeded=0;
+      _seeded = 0;
+
+      _time_counter = 0.0;
+
    }


@@ -325,37 +349,36 @@ namespace Grid {
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;

-      int multiplicity = RNGfillable(_grid, l._grid);
+      double inner_time_counter = usecond();

-      int Nsimd = _grid->Nsimd();
-      int osites = _grid->oSites();
+      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
+
+      int Nsimd = _grid->Nsimd();// guaranteed to be the same for l._grid too
+      int osites = _grid->oSites();// guaranteed to be <= l._grid->oSites() by a factor multiplicity
      int words = sizeof(scalar_object) / sizeof(scalar_type);

      PARALLEL_FOR_LOOP
      for (int ss = 0; ss < osites; ss++) {

        std::vector<scalar_object> buf(Nsimd);
-        for (int m = 0; m < multiplicity;
-             m++) {  // Draw from same generator multiplicity times
+        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times

-          int sm = multiplicity * ss +
-                   m;  // Maps the generator site to the fine site
+          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site

          for (int si = 0; si < Nsimd; si++) {
            
            int gdx = generator_idx(ss, si);  // index of generator state
            scalar_type *pointer = (scalar_type *)&buf[si];
            dist[gdx].reset();
-            for (int idx = 0; idx < words; idx++) {
-              
+            for (int idx = 0; idx < words; idx++) 
              fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
-            }
          }
-
          // merge into SIMD lanes
          merge(l._odata[sm], buf);
        }
      }
+
+      _time_counter += usecond()- inner_time_counter;
    };

    void SeedRandomDevice(void) {
@@ -366,6 +389,12 @@ namespace Grid {
      fixedSeed src(seeds);
      Seed(src);
    }
+
+    void Report(){
+      std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
+    }
+
+
  };

  template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -44,14 +44,14 @@ namespace QCD {
  // Ultimately need Impl to always define types where XXX is opaque
  //
  //    typedef typename XXX               Simd;
-  //    typedef typename XXX     GaugeLinkField;	
+  //    typedef typename XXX     GaugeLinkField;        
  //    typedef typename XXX         GaugeField;
  //    typedef typename XXX      GaugeActField;
  //    typedef typename XXX       FermionField;
  //    typedef typename XXX  DoubledGaugeField;
  //    typedef typename XXX         SiteSpinor;
-  //    typedef typename XXX     SiteHalfSpinor;	
-  //    typedef typename XXX         Compressor;	
+  //    typedef typename XXX     SiteHalfSpinor;        
+  //    typedef typename XXX         Compressor;        
  //
  // and Methods:
  //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
@@ -94,17 +94,17 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////
  
 #define INHERIT_FIMPL_TYPES(Impl)\
-  typedef typename Impl::FermionField           FermionField;		\
-  typedef typename Impl::DoubledGaugeField DoubledGaugeField;		\
-  typedef typename Impl::SiteSpinor               SiteSpinor;		\
-  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
-  typedef typename Impl::Compressor               Compressor;		\
-  typedef typename Impl::StencilImpl             StencilImpl;		\
-  typedef typename Impl::ImplParams ImplParams;				\
+  typedef typename Impl::FermionField           FermionField;           \
+  typedef typename Impl::DoubledGaugeField DoubledGaugeField;           \
+  typedef typename Impl::SiteSpinor               SiteSpinor;           \
+  typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;           \
+  typedef typename Impl::Compressor               Compressor;           \
+  typedef typename Impl::StencilImpl             StencilImpl;           \
+  typedef typename Impl::ImplParams ImplParams;                         \
  typedef typename Impl::Coeff_t       Coeff_t;
  
 #define INHERIT_IMPL_TYPES(Base) \
-  INHERIT_GIMPL_TYPES(Base)	 \
+  INHERIT_GIMPL_TYPES(Base)      \
  INHERIT_FIMPL_TYPES(Base)
  
  /////////////////////////////////////////////////////////////////////////////
@@ -148,11 +148,11 @@ namespace QCD {
    bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
      
    inline void multLink(SiteHalfSpinor &phi,
-			 const SiteDoubledGaugeField &U,
-			 const SiteHalfSpinor &chi,
-			 int mu,
-			 StencilEntry *SE,
-			 StencilImpl &St) {
+                         const SiteDoubledGaugeField &U,
+                         const SiteHalfSpinor &chi,
+                         int mu,
+                         StencilEntry *SE,
+                         StencilImpl &St) {
      mult(&phi(), &U(mu), &chi());
    }
      
@@ -162,16 +162,16 @@ namespace QCD {
    }
      
    inline void DoubleStore(GridBase *GaugeGrid,
-			    DoubledGaugeField &Uds,
-			    const GaugeField &Umu) {
+                            DoubledGaugeField &Uds,
+                            const GaugeField &Umu) {
      conformable(Uds._grid, GaugeGrid);
      conformable(Umu._grid, GaugeGrid);
      GaugeLinkField U(GaugeGrid);
      for (int mu = 0; mu < Nd; mu++) {
-	U = PeekIndex<LorentzIndex>(Umu, mu);
-	PokeIndex<LorentzIndex>(Uds, U, mu);
-	U = adj(Cshift(U, mu, -1));
-	PokeIndex<LorentzIndex>(Uds, U, mu + 4);
+        U = PeekIndex<LorentzIndex>(Umu, mu);
+        PokeIndex<LorentzIndex>(Uds, U, mu);
+        U = adj(Cshift(U, mu, -1));
+        PokeIndex<LorentzIndex>(Uds, U, mu + 4);
      }
    }

@@ -189,11 +189,11 @@ namespace QCD {
      
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<tmp._grid->oSites();sss++){
-	int sU=sss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
-	}
+        int sU=sss;
+        for(int s=0;s<Ls;s++){
+          int sF = s+Ls*sU;
+          tmp[sU] = tmp[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde[sF],Atilde[sF])); // ordering here
+        }
      }
      PokeIndex<LorentzIndex>(mat,tmp,mu);
      
@@ -248,12 +248,12 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
  }

  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
-		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
-		       StencilImpl &St) {
+                       const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+                       StencilImpl &St) {
    SiteGaugeLink UU;
    for (int i = 0; i < Nrepresentation; i++) {
      for (int j = 0; j < Nrepresentation; j++) {
-	vsplat(UU()()(i, j), U(mu)()(i, j));
+        vsplat(UU()()(i, j), U(mu)()(i, j));
      }
    }
    mult(&phi(), &UU(), &chi());
@@ -290,10 +290,40 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
  {
    assert(0);
  }
-      
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu) 
-  {
-	assert(0);
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+    int LLs = Btilde._grid->_rdimensions[0];
+    conformable(Atilde._grid,Btilde._grid);
+    GaugeLinkField tmp(mat._grid); 
+    tmp = zero;
+    typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
+    std::vector<typename result_type::scalar_object> v_scalar_object(Btilde._grid->Nsimd());
+
+    PARALLEL_FOR_LOOP
+    for (int sss = 0; sss < tmp._grid->oSites(); sss++) {
+      std::vector<int> ocoor;
+      tmp._grid->oCoorFromOindex(ocoor,sss); 
+      for (int si = 0; si < tmp._grid->iSites(); si++){
+        typename result_type::scalar_object scalar_object;
+        scalar_object = zero;
+        std::vector<int> local_coor(tmp._grid->Nd());      
+        std::vector<int> icoor;  
+        tmp._grid->iCoorFromIindex(icoor,si);
+        for (int i = 0; i < tmp._grid->Nd(); i++) local_coor[i] = ocoor[i] + tmp._grid->_rdimensions[i]*icoor[i];
+        
+        for (int s = 0; s < LLs; s++) {
+          std::vector<int> slocal_coor(Btilde._grid->Nd());
+          slocal_coor[0] = s;
+          for (int s4d = 1; s4d< Btilde._grid->Nd(); s4d++) slocal_coor[s4d] = local_coor[s4d-1];
+          int sF = Btilde._grid->oIndexReduced(slocal_coor);  
+          assert(sF < Btilde._grid->oSites());
+          extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), v_scalar_object); 
+          for (int sv = 0; sv < v_scalar_object.size(); sv++) scalar_object += v_scalar_object[sv];  // sum across the 5d dimension
+        }
+        tmp._odata[sss].putlane(scalar_object, si);
+      }
+    }
+    PokeIndex<LorentzIndex>(mat, tmp, mu);
  }
 };
    
@@ -339,19 +369,19 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 // provide the multiply by link that is differentiated between Gparity (with
 // flavour index) and non-Gparity
 inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
-		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
-		      StencilImpl &St) {
+                      const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
+                      StencilImpl &St) {

  typedef SiteHalfSpinor vobj;
   typedef typename SiteHalfSpinor::scalar_object sobj;
-	
+        
   vobj vtmp;
   sobj stmp;
-	
+        
   GridBase *grid = St._grid;
-	
+        
   const int Nsimd = grid->Nsimd();
-	
+        
   int direction = St._directions[mu];
   int distance = St._distances[mu];
   int ptype = St._permute_type[mu];
@@ -359,13 +389,13 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   
   // Fixme X.Y.Z.T hardcode in stencil
   int mmu = mu % Nd;
-	
+        
   // assert our assumptions
   assert((distance == 1) || (distance == -1));  // nearest neighbour stencil hard code
   assert((sl == 1) || (sl == 2));
   
   std::vector<int> icoor;
-	
+        
   if ( SE->_around_the_world && Params.twists[mmu] ) {

     if ( sl == 2 ) {
@@ -375,25 +405,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
       extract(chi,vals);
       for(int s=0;s<Nsimd;s++){

-	 grid->iCoorFromIindex(icoor,s);
-	      
-	 assert((icoor[direction]==0)||(icoor[direction]==1));
-	      
-	 int permute_lane;
-	 if ( distance == 1) {
-	   permute_lane = icoor[direction]?1:0;
-	 } else {
-	   permute_lane = icoor[direction]?0:1;
-	 }
-	      
-	 if ( permute_lane ) { 
-	   stmp(0) = vals[s](1);
-	   stmp(1) = vals[s](0);
-	   vals[s] = stmp;
-	      }
+         grid->iCoorFromIindex(icoor,s);
+              
+         assert((icoor[direction]==0)||(icoor[direction]==1));
+              
+         int permute_lane;
+         if ( distance == 1) {
+           permute_lane = icoor[direction]?1:0;
+         } else {
+           permute_lane = icoor[direction]?0:1;
+         }
+              
+         if ( permute_lane ) { 
+           stmp(0) = vals[s](1);
+           stmp(1) = vals[s](0);
+           vals[s] = stmp;
+              }
       }
       merge(vtmp,vals);
-	    
+            
     } else { 
       vtmp(0) = chi(1);
       vtmp(1) = chi(0);
@@ -418,11 +448,11 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   GaugeLinkField Uconj(GaugeGrid);
   
   Lattice<iScalar<vInteger> > coor(GaugeGrid);
-	
+        
   for(int mu=0;mu<Nd;mu++){
-	  
+          
     LatticeCoordinate(coor,mu);
-	  
+          
     U     = PeekIndex<LorentzIndex>(Umu,mu);
     Uconj = conjugate(U);
     
@@ -431,13 +461,13 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
     if ( Params.twists[mu] ) { 
       Uconj = where(coor==neglink,-Uconj,Uconj);
     }
-	  
+          
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu) = U[ss]();
       Uds[ss](1)(mu) = Uconj[ss]();
     }
-	  
+          
     U     = adj(Cshift(U    ,mu,-1));      // correct except for spanning the boundary
     Uconj = adj(Cshift(Uconj,mu,-1));
 
@@ -445,22 +475,22 @@ PARALLEL_FOR_LOOP
     if ( Params.twists[mu] ) { 
       Utmp = where(coor==0,Uconj,Utmp);
     }
-	  
+          
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](0)(mu+4) = Utmp[ss]();
     }
-	  
+          
     Utmp = Uconj;
     if ( Params.twists[mu] ) { 
       Utmp = where(coor==0,U,Utmp);
     }
-	  
+          
 PARALLEL_FOR_LOOP
     for(auto ss=U.begin();ss<U.end();ss++){
       Uds[ss](1)(mu+4) = Utmp[ss]();
     }
-	  
+          
   }
 }
      
@@ -482,7 +512,7 @@ PARALLEL_FOR_LOOP
 inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {

   int Ls = Btilde._grid->_fdimensions[0];
-	
+        
   GaugeLinkField tmp(mat._grid);
   tmp = zero;
 PARALLEL_FOR_LOOP
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -271,11 +271,14 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  assert(dirdisp<=7);
  assert(dirdisp>=0);

+  int LLs = out._grid->_rdimensions[0];
+
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<Umu._grid->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      int sU=ss;
-      int sF = s+Ls*sU; 
+    int sU=ss;
+    for(int s=0;s<LLs;s++){
+      int sF = s+LLs*sU; 
+      assert(sF < out._grid->oSites());
      Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
    }
  }
@@ -305,6 +308,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
  DerivCommTime+=usecond();

  Atilde=A;
+  int LLs = B._grid->_rdimensions[0];
+

  DerivComputeTime-=usecond();
  for (int mu = 0; mu < Nd; mu++) {
@@ -321,20 +326,18 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
    DerivDhopComputeTime -= usecond();
    PARALLEL_FOR_LOOP
    for (int sss = 0; sss < U._grid->oSites(); sss++) {
-      for (int s = 0; s < Ls; s++) {
-        int sU = sss;
-        int sF = s + Ls * sU;
-
+      int sU = sss;
+      for (int s = 0; s < LLs; s++) {
+        int sF = s + LLs * sU;
        assert(sF < B._grid->oSites());
        assert(sU < U._grid->oSites());

        Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
-
-        ////////////////////////////
-        // spin trace outer product
-        ////////////////////////////
      }
    }
+    ////////////////////////////
+    // spin trace outer product
+    ////////////////////////////
    DerivDhopComputeTime += usecond();
    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
  }
@@ -349,7 +352,7 @@ void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
 {
  conformable(A._grid,FermionGrid());  
  conformable(A._grid,B._grid);
-  conformable(GaugeGrid(),mat._grid);
+  //conformable(GaugeGrid(),mat._grid);

  mat.checkerboard = A.checkerboard;

@@ -363,7 +366,7 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 					int dag)
 {
  conformable(A._grid,FermionRedBlackGrid());
-  conformable(GaugeRedBlackGrid(),mat._grid);
+  //conformable(GaugeRedBlackGrid(),mat._grid);
  conformable(A._grid,B._grid);

  assert(B.checkerboard==Odd);
@@ -381,7 +384,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 					int dag)
 {
  conformable(A._grid,FermionRedBlackGrid());
-  conformable(GaugeRedBlackGrid(),mat._grid);
+  //conformable(GaugeRedBlackGrid(),mat._grid);
  conformable(A._grid,B._grid);

  assert(B.checkerboard==Even);
--- a/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
+++ b/lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
@@ -68,8 +68,14 @@ namespace Grid{
 	  assert(U.checkerboard==Odd);
 	  assert(V.checkerboard==U.checkerboard);

-	  GaugeField ForceO(ucbgrid);
-	  GaugeField ForceE(ucbgrid);
+          // NOTE Guido: WE DO NOT WANT TO USE THIS GRID FOR THE FORCE
+          // INHERIT FROM THE Force field
+          //GaugeField ForceO(ucbgrid);
+          //GaugeField ForceE(ucbgrid);
+          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
+          GaugeField ForceO(forcecb);
+          GaugeField ForceE(forcecb);
+

 	  //  X^dag Der_oe MeeInv Meo Y
 	  // Use Mooee as nontrivial but gauge field indept
@@ -110,8 +116,14 @@ namespace Grid{
 	  assert(V.checkerboard==Odd);
 	  assert(V.checkerboard==V.checkerboard);

-	  GaugeField ForceO(ucbgrid);
-	  GaugeField ForceE(ucbgrid);
+          // NOTE Guido: WE DO NOT WANT TO USE THIS GRID FOR THE FORCE
+          // INHERIT FROM THE Force field
+
+	  //GaugeField ForceO(ucbgrid);
+	  //GaugeField ForceE(ucbgrid);
+          GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
+          GaugeField ForceO(forcecb);
+          GaugeField ForceE(forcecb);

 	  //  X^dag Der_oe MeeInv Meo Y
 	  // Use Mooee as nontrivial but gauge field indept
@@ -130,6 +142,8 @@ namespace Grid{
 	  setCheckerboard(Force,ForceE); 
 	  setCheckerboard(Force,ForceO);
 	  Force=-Force;
+
+
 	}

    };
--- a/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -166,7 +166,9 @@ namespace Grid{
 	FermionField  X(NumOp.FermionRedBlackGrid());
 	FermionField  Y(NumOp.FermionRedBlackGrid());

-	GaugeField   force(NumOp.GaugeGrid());	
+	//GaugeField   force(NumOp.GaugeGrid());
+        GaugeField   force(dSdU._grid);
+        conformable(force._grid, dSdU._grid);

 	//Y=Vdag phi
 	//X = (Mdag M)^-1 V^dag phi
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -114,6 +114,7 @@ class Integrator {
    // Fundamental updates, include smearing
    for (int a = 0; a < as[level].actions.size(); ++a) {
      Field force(U._grid);
+      conformable(U._grid, Mom._grid);
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta

--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -386,6 +386,19 @@ class Grid_simd {
    }
  }

+  ///////////////////////////////
+  // Getting single lanes
+  ///////////////////////////////
+  inline Scalar_type getlane(int lane) {
+    return ((Scalar_type*)&v)[lane];
+  }
+
+  inline void putlane(const Scalar_type &S, int lane){
+    ((Scalar_type*)&v)[lane] = S;
+  }
+
+
+  
 };  // end of Grid_simd class definition


--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -88,6 +88,21 @@ class iScalar {
    zeroit(*this);
    return *this;
  }
+
+
+  // managing the internal vector structure
+  strong_inline scalar_object getlane(int lane){
+    scalar_object ret;
+    ret._internal = _internal.getlane(lane);
+    return ret;
+  }
+
+  strong_inline void putlane(scalar_object &s, int lane){
+    _internal.putlane(s._internal,lane);
+  }
+
+
+  
  friend strong_inline void vstream(iScalar<vtype> &out,
                                    const iScalar<vtype> &in) {
    vstream(out._internal, in._internal);
@@ -226,6 +241,20 @@ class iVector {
    zeroit(*this);
    return *this;
  }
+
+  strong_inline scalar_object getlane(int lane){
+    scalar_object ret;
+    for (int i = 0; i < N; i++) ret._internal[i] = _internal[i].getlane(lane);
+    return ret;
+  }
+
+  strong_inline void putlane(scalar_object &s, int lane){
+    for (int i = 0; i < N; i++) _internal[i].putlane(s._internal[i],lane);
+  }
+  
+
+
+  
  friend strong_inline void zeroit(iVector<vtype, N> &that) {
    for (int i = 0; i < N; i++) {
      zeroit(that._internal[i]);
@@ -349,6 +378,25 @@ class iMatrix {
    return *this;
  }

+
+  strong_inline scalar_object getlane(int lane){
+    scalar_object ret;
+    for (int i = 0; i < N; i++) {
+      for (int j = 0; j < N; j++) {
+	ret._internal[i][j] = _internal[i][j].getlane(lane);
+      }
+    }
+    return ret;
+  }
+
+  strong_inline void putlane(scalar_object &s, int lane){
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) _internal[i][j].putlane(s._internal[i][j],lane);
+  }
+  
+
+  
+
  friend strong_inline void zeroit(iMatrix<vtype,N> &that){
    for(int i=0;i<N;i++){
      for(int j=0;j<N;j++){
--- a/tests/hmc/Test_hmc_EODWFRatio_Binary.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio_Binary.cc
@@ -62,31 +62,45 @@ public:
  void BuildTheAction(int argc, char **argv)

  {
-    typedef WilsonImplR ImplPolicy;
-    typedef ScaledShamirFermionR FermionAction;
+    //typedef WilsonImplR ImplPolicy;
+    typedef DomainWallVec5dImplR ImplPolicy;
+    typedef ScaledShamirFermion<ImplPolicy> FermionAction;
    typedef typename FermionAction::FermionField FermionField;

-    const int Ls = 12;
+    const int Ls = 8;

    UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);

+
+    GridCartesian* sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+    GridRedBlackCartesian* sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+
+    
+    FGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+    FrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+    
+    /*
    FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+    */

    // temporarily need a gauge field
    LatticeGaugeField  U(UGrid);

    // Gauge action
    double beta = 4.0;
-    IwasakiGaugeActionR Iaction(beta);
+    WilsonGaugeActionR Iaction(beta);

    Real mass = 0.04;
    Real pv   = 1.0;
    RealD M5  = 1.5;
    RealD scale = 2.0;
-    FermionAction DenOp(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,scale);
-    FermionAction NumOp(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pv,M5,scale);
+    FermionAction DenOp(U,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,mass,M5,scale);
+    FermionAction NumOp(U,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,pv,M5,scale);
+
+    std::cout << GridLogMessage << "Frb Osites: " << FrbGrid->oSites() << std::endl;
+    std::cout << GridLogMessage << "sUGrid Osites: " << sUGrid->oSites() << std::endl;

    double StoppingCondition = 1.0e-8;
    double MaxCGIterations = 10000;
@@ -94,7 +108,7 @@ public:
    TwoFlavourEvenOddRatioPseudoFermionAction<ImplPolicy> Nf2(NumOp, DenOp,CG,CG);

    // Set smearing (true/false), default: false
-    Nf2.is_smeared = true;
+    Nf2.is_smeared = false;

    // Collect actions
    // here an example of 2 level integration
@@ -154,7 +168,7 @@ int main(int argc, char **argv) {

  // Seeds for the random number generators
  std::vector<int> SerSeed({1, 2, 3, 4, 5});
-  std::vector<int> ParSeed({6, 7, 8, 9, 10});
+  std::vector<int> ParSeed({6, 7, 8, 9, 5});
  TheHMC.RNGSeeds(SerSeed, ParSeed);

  TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
--- a/tests/hmc/Test_hmc_WilsonFermionGauge_Binary.cc
+++ b/tests/hmc/Test_hmc_WilsonFermionGauge_Binary.cc
@@ -156,4 +156,6 @@ int main(int argc, char **argv) {
  TheHMC.MDparameters.set(10, 1.0);// MDsteps, traj length

  TheHMC.BuildTheAction(argc, argv);
+
+  Grid_finalize();
 }