Build reorg with which I am a bit happier

2026-02-21 22:26:13 +00:00 · 2015-04-18 21:22:50 +01:00
parent a17ce0695b
commit aee6669d0b
41 changed files with 0 additions and 0 deletions
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -0,0 +1,160 @@
+#ifndef GRID_LATTICE_ARITH_H
+#define GRID_LATTICE_ARITH_H
+
+namespace Grid {
+
+  // Unary minus: builds a new lattice on r's grid whose outer-site entries
+  // are the negation of the corresponding entries of r.
+  template<class vobj>
+  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
+  {
+    Lattice<vobj> negated(r._grid);
+    const auto osites = r._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      negated._odata[site]= -r._odata[site];
+    }
+    return negated;
+  }
+  
+  // Lattice-wide axpy with a real (double) coefficient.
+  // Asserts that lhs and rhs are conformable, then applies the per-site
+  // axpy(ret,a,lhs,rhs) overload at every outer site of lhs's grid.
+  // ret is not checked against lhs/rhs here.
+  template<class vobj>
+  inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    conformable(lhs,rhs);
+    const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      axpy(&ret._odata[site],a,&lhs._odata[site],&rhs._odata[site]);
+    }
+  }
+  // Lattice-wide axpy with a complex coefficient.
+  // Asserts that lhs and rhs are conformable, then applies the per-site
+  // axpy(ret,a,lhs,rhs) overload at every outer site of lhs's grid.
+  // ret is not checked against lhs/rhs here.
+  template<class vobj>
+  inline void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    conformable(lhs,rhs);
+    const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      axpy(&ret._odata[site],a,&lhs._odata[site],&rhs._odata[site]);
+    }
+  }
+  
+  
+  // Lattice-wide mult: for every outer site, delegates to the per-site
+  // mult(&ret,&lhs,&rhs) overload. lhs and rhs must be conformable (asserted);
+  // ret is not checked.
+  template<class obj1,class obj2,class obj3>
+    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    const uint32_t nsites = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<nsites;site++){
+      mult(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
+    }
+  }
+  
+  // Lattice-wide mac: for every outer site, delegates to the per-site
+  // mac(&ret,&lhs,&rhs) overload (presumably multiply-accumulate into ret --
+  // confirm against the tensor-level mac). lhs and rhs must be conformable
+  // (asserted); ret is not checked.
+  template<class obj1,class obj2,class obj3>
+    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    const uint32_t nsites = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<nsites;site++){
+      mac(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
+    }
+  }
+  
+  // Lattice-wide sub: for every outer site, delegates to the per-site
+  // sub(&ret,&lhs,&rhs) overload. lhs and rhs must be conformable (asserted);
+  // ret is not checked.
+  template<class obj1,class obj2,class obj3>
+    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      sub(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
+    }
+  }
+  // Lattice-wide add: for every outer site, delegates to the per-site
+  // add(&ret,&lhs,&rhs) overload. lhs and rhs must be conformable (asserted);
+  // ret is not checked.
+  template<class obj1,class obj2,class obj3>
+    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      add(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
+    }
+  }
+  
+  // Lattice BinOp Lattice,
+  // Lattice * Lattice: allocates the result on rhs's grid and fills it via mult().
+  template<class left,class right>
+    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    // mult() already asserts conformability; it is deliberately not repeated here.
+    typedef decltype(lhs._odata[0]*rhs._odata[0]) site_type;
+    Lattice<site_type> product(rhs._grid);
+    mult(product,lhs,rhs);
+    return product;
+  }
+  // Lattice + Lattice: allocates the result on rhs's grid and fills it via add().
+  // NOTE(review): the result's site type is derived from operator* on the site
+  // objects rather than operator+ -- confirm this is intended.
+  template<class left,class right>
+    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    // add() already asserts conformability; it is deliberately not repeated here.
+    typedef decltype(lhs._odata[0]*rhs._odata[0]) site_type;
+    Lattice<site_type> sum(rhs._grid);
+    add(sum,lhs,rhs);
+    return sum;
+  }
+  // Lattice - Lattice: allocates the result on rhs's grid and fills it via sub().
+  // NOTE(review): the result's site type is derived from operator* on the site
+  // objects rather than operator- -- confirm this is intended.
+  template<class left,class right>
+    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    // sub() already asserts conformability; it is deliberately not repeated here.
+    typedef decltype(lhs._odata[0]*rhs._odata[0]) site_type;
+    Lattice<site_type> difference(rhs._grid);
+    sub(difference,lhs,rhs);
+    return difference;
+  }
+  
+  // Scalar BinOp Lattice ;generate return type
+  // scalar * Lattice: result[site] = lhs * rhs[site] for every outer site of rhs's grid.
+  template<class left,class right>
+    inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    typedef decltype(lhs*rhs._odata[0]) site_type;
+    Lattice<site_type> scaled(rhs._grid);
+    const auto osites = rhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      scaled._odata[site]=lhs*rhs._odata[site];
+    }
+    return scaled;
+  }
+  // scalar + Lattice: result[site] = lhs + rhs[site] for every outer site of rhs's grid.
+  // NOTE(review): the result's site type is deduced from lhs*rhs[site], not
+  // lhs+rhs[site] -- confirm this is intended.
+  template<class left,class right>
+    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+    {
+      typedef decltype(lhs*rhs._odata[0]) site_type;
+      Lattice<site_type> shifted(rhs._grid);
+      const auto osites = rhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        shifted._odata[site]=lhs+rhs._odata[site];
+      }
+      return shifted;
+    }
+  // scalar - Lattice: result[site] = lhs - rhs[site] for every outer site of rhs's grid.
+  // NOTE(review): the result's site type is deduced from lhs*rhs[site], not
+  // lhs-rhs[site] -- confirm this is intended.
+  template<class left,class right>
+    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    typedef decltype(lhs*rhs._odata[0]) site_type;
+    Lattice<site_type> shifted(rhs._grid);
+    const auto osites = rhs._grid->oSites();
+#pragma omp parallel for
+    for(int site=0;site<osites;site++){
+      shifted._odata[site]=lhs-rhs._odata[site];
+    }
+    return shifted;
+  }
+    // Lattice * scalar: result[site] = lhs[site] * rhs for every outer site of lhs's grid.
+    template<class left,class right>
+      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+      typedef decltype(lhs._odata[0]*rhs) site_type;
+      Lattice<site_type> scaled(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        scaled._odata[site]=lhs._odata[site]*rhs;
+      }
+      return scaled;
+    }
+    // Lattice + scalar: result[site] = lhs[site] + rhs for every outer site of lhs's grid.
+    // NOTE(review): the result's site type is deduced from lhs[site]*rhs, not
+    // lhs[site]+rhs -- confirm this is intended.
+    template<class left,class right>
+      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
+        // The site count comes from the lattice operand (lhs); rhs is the
+        // scalar here and carries no grid.
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites(); ss++){
+          ret._odata[ss]=lhs._odata[ss]+rhs;
+        }
+        return ret;
+    }
+    // Lattice - scalar: result[site] = lhs[site] - rhs for every outer site of lhs's grid.
+    // NOTE(review): the result's site type is deduced from lhs[site]*rhs, not
+    // lhs[site]-rhs -- confirm this is intended.
+    template<class left,class right>
+      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
+      // The site count comes from the lattice operand (lhs); rhs is the
+      // scalar here and carries no grid.
+#pragma omp parallel for
+      for(int ss=0;ss<lhs._grid->oSites(); ss++){
+        ret._odata[ss]=lhs._odata[ss]-rhs;
+      }
+      return ret;
+    }
+}
+#endif
--- a/lib/lattice/Grid_lattice_conformable.h
+++ b/lib/lattice/Grid_lattice_conformable.h
@@ -0,0 +1,14 @@
+#ifndef GRID_LATTICE_CONFORMABLE_H
+#define GRID_LATTICE_CONFORMABLE_H
+
+namespace Grid {
+
+    // Debug-time compatibility check for two lattices (possibly of different
+    // site types). Asserts that both refer to the same grid pointer and carry
+    // the same checkerboard value. Being assert-based, the checks vanish when
+    // NDEBUG is defined.
+    template<class obj1,class obj2>
+    void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
+    {
+        // Pointer identity: the two lattices must share one grid object.
+        assert(lhs._grid == rhs._grid);
+        assert(lhs.checkerboard == rhs.checkerboard);
+    }
+
+}
+#endif
--- a/lib/lattice/Grid_lattice_coordinate.h
+++ b/lib/lattice/Grid_lattice_coordinate.h
@@ -0,0 +1,45 @@
+#ifndef GRID_LATTICE_COORDINATE_H
+#define GRID_LATTICE_COORDINATE_H
+
+namespace Grid {
+
+    // Fill lattice l so that each site holds the mu-th component of that
+    // site's global coordinate, as reported by RankIndexToGlobalCoor for this
+    // rank. For every outer site the per-lane values are gathered into a
+    // scratch buffer and merged into one vInteger.
+    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
+    {
+      GridBase *grid = l._grid;
+      const int lanes = grid->iSites();
+      const int thisRank_osites = grid->oSites();
+
+      std::vector<int> globalCoor;
+      std::vector<Integer> laneValue(lanes);
+      std::vector<Integer *> lanePtr(lanes);
+      vInteger packed;
+
+      for(int osite=0;osite<thisRank_osites;osite++){
+        for(int lane=0;lane<grid->iSites();lane++){
+          // (debug alternative used rank 0 instead of grid->ThisRank())
+          grid->RankIndexToGlobalCoor(grid->ThisRank(),osite,lane,globalCoor);
+          laneValue[lane]=globalCoor[mu];
+          lanePtr[lane]=&laneValue[lane];
+        }
+        merge(packed,lanePtr);
+        l._odata[osite]=packed;
+      }
+    }
+
+    // FIXME debug-only helper; deprecate this -- made obsolete by
+    // LatticeCoordinate().
+    //
+    // Overwrites the raw storage of lattice l, viewed as an array of Real,
+    // with a pattern derived from the outer-site index: for outer site i and
+    // SIMD lane pair (vv, vv+1) the value written is i + vv*500.
+    // NOTE(review): the storage is addressed as Real while the extents are
+    // computed in units of vRealF; this presumably assumes Real is the scalar
+    // type of vRealF and that vRealF::Nsimd() is even -- confirm.
+    template<class vobj> void lex_sites(Lattice<vobj> &l){
+      Real *v_ptr = (Real *)&l._odata[0];
+      size_t o_len = l._grid->oSites();                 // number of outer sites
+      size_t v_len = sizeof(vobj)/sizeof(vRealF);       // vRealF words per site object
+      size_t vec_len = vRealF::Nsimd();                 // lanes per vRealF word
+
+      for(int i=0;i<o_len;i++){
+	for(int j=0;j<v_len;j++){
+          // Lanes are written two at a time with the same value.
+          for(int vv=0;vv<vec_len;vv+=2){
+	    v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
+	    v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
+	  }
+	}}
+    }
+
+
+}
+#endif
--- a/lib/lattice/Grid_lattice_local.h
+++ b/lib/lattice/Grid_lattice_local.h
@@ -0,0 +1,54 @@
+#ifndef GRID_LATTICE_LOCALREDUCTION_H
+#define GRID_LATTICE_LOCALREDUCTION_H
+
+///////////////////////////////////////////////
+// localInner, localNorm, outerProduct
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////
+    // Locally reduced routines: reduced within each site, not summed over sites
+    /////////////////////////////////////////////////////
+
+    // localNorm2: per-site innerProduct of rhs with itself, returned as a
+    // lattice of the tensor-reduced type on rhs's grid (no sum over sites).
+    template<class vobj>
+    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+    {
+      typedef typename vobj::tensor_reduced reduced_type;
+      Lattice<reduced_type> siteNorm(rhs._grid);
+      const auto osites = rhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        siteNorm._odata[site]=innerProduct(rhs._odata[site],rhs._odata[site]);
+      }
+      return siteNorm;
+    }
+    
+    // localInnerProduct: per-site innerProduct(lhs[site],rhs[site]), returned
+    // as a lattice of the tensor-reduced type on rhs's grid (no sum over sites).
+    // No conformability check between lhs and rhs is made here.
+    template<class vobj>
+    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
+      -> Lattice<typename vobj::tensor_reduced>
+    {
+      typedef typename vobj::tensor_reduced reduced_type;
+      Lattice<reduced_type> siteInner(rhs._grid);
+      const auto osites = rhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        siteInner._odata[site]=innerProduct(lhs._odata[site],rhs._odata[site]);
+      }
+      return siteInner;
+    }
+    
+    // outerProduct: per-site outerProduct(lhs[site],rhs[site]) on rhs's grid.
+    //   Scalar x Scalar -> Scalar
+    //   Vector x Vector -> Matrix
+    // No conformability check between lhs and rhs is made here.
+    template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+    {
+      typedef decltype(outerProduct(lhs._odata[0],rhs._odata[0])) site_type;
+      Lattice<site_type> outer(rhs._grid);
+      const auto osites = rhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        outer._odata[site]=outerProduct(lhs._odata[site],rhs._odata[site]);
+      }
+      return outer;
+    }
+
+}
+
+#endif
--- a/lib/lattice/Grid_lattice_peekpoke.h
+++ b/lib/lattice/Grid_lattice_peekpoke.h
@@ -0,0 +1,143 @@
+#ifndef GRID_LATTICE_PEEK_H
+#define GRID_LATTICE_PEEK_H
+
+///////////////////////////////////////////////
+// Peeking and poking around
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Peek internal indices of a Lattice object
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies the site-level peekIndex<Index>() to every outer site of lhs
+    // and collects the results into a new lattice on the same grid.
+    template<int Index,class vobj>
+    inline auto peekIndex(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
+    {
+      typedef decltype(peekIndex<Index>(lhs._odata[0])) site_type;
+      Lattice<site_type> peeked(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        peeked._odata[site] = peekIndex<Index>(lhs._odata[site]);
+      }
+      return peeked;
+    }
+    // One-index variant: applies the site-level peekIndex<Index>(obj,i) to
+    // every outer site of lhs and collects the results into a new lattice on
+    // the same grid.
+    template<int Index,class vobj>
+      inline auto peekIndex(const Lattice<vobj> &lhs,int i)
+      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
+    {
+      typedef decltype(peekIndex<Index>(lhs._odata[0],i)) site_type;
+      Lattice<site_type> peeked(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        peeked._odata[site] = peekIndex<Index>(lhs._odata[site],i);
+      }
+      return peeked;
+    }
+    // Two-index variant: applies the site-level peekIndex<Index>(obj,i,j) to
+    // every outer site of lhs and collects the results into a new lattice on
+    // the same grid.
+    template<int Index,class vobj>
+      inline auto peekIndex(const Lattice<vobj> &lhs,int i,int j)
+      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+    {
+      typedef decltype(peekIndex<Index>(lhs._odata[0],i,j)) site_type;
+      Lattice<site_type> peeked(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        peeked._odata[site] = peekIndex<Index>(lhs._odata[site],i,j);
+      }
+      return peeked;
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Poke internal indices of a Lattice object
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies the site-level pokeIndex<Index>(lhs[site],rhs[site]) at every
+    // outer site of lhs's grid, modifying lhs in place.
+    template<int Index,class vobj> inline
+    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> & rhs)
+    {
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        pokeIndex<Index>(lhs._odata[site],rhs._odata[site]);
+      }
+    }
+    // One-index variant: applies the site-level
+    // pokeIndex<Index>(lhs[site],rhs[site],i) at every outer site of lhs's
+    // grid, modifying lhs in place.
+    template<int Index,class vobj> inline
+    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
+    {
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i);
+      }
+    }
+    // Two-index variant: applies the site-level
+    // pokeIndex<Index>(lhs[site],rhs[site],i,j) at every outer site of lhs's
+    // grid, modifying lhs in place.
+    template<int Index,class vobj> inline
+    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
+    {
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i,j);
+      }
+    }
+
+    //////////////////////////////////////////////////////
+    // Poke a scalar object into the SIMD array
+    //////////////////////////////////////////////////////
+    // Write the scalar object s into lattice l at global coordinate `site`.
+    //
+    //   s    : scalar (non-SIMD) site object; sizeof(sobj)*Nsimd must equal
+    //          sizeof(vobj) (asserted).
+    //   l    : lattice to modify; its checkerboard must match that of `site`
+    //          (asserted).
+    //   site : global coordinate of the target site.
+    //
+    // The coordinate is translated to (rank, outer index, inner index). s is
+    // broadcast from rank 0, and only the rank that owns the site performs
+    // the extract-modify-merge on its local storage: the outer/inner indices
+    // refer to the owner's storage, so other ranks must leave theirs untouched.
+    template<class vobj,class sobj>
+    void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+
+      GridBase *grid=l._grid;
+
+      typedef typename vobj::scalar_type scalar_type;
+
+      int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      int rank,odx,idx;
+      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
+
+      // Optional to broadcast from node 0.
+      grid->Broadcast(0,s);
+
+      // Non-owning ranks take part in the broadcast above but do not write.
+      if ( rank != grid->ThisRank() ) return;
+
+      std::vector<sobj> buf(Nsimd);
+      std::vector<scalar_type *> pointers(Nsimd);
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+
+      // extract-modify-merge cycle is easiest way and this is not perf critical
+      extract(l._odata[odx],pointers);
+
+      buf[idx] = s;
+
+      // buf is never resized, so the pointers set up above are still valid.
+      merge(l._odata[odx],pointers);
+    }
+
+    //////////////////////////////////////////////////////////
+    // Peek a scalar object from the SIMD array
+    //////////////////////////////////////////////////////////
+    // Read the scalar object stored in lattice l at global coordinate `site`
+    // into s. The coordinate is translated to (rank, outer index, inner
+    // index); the addressed SIMD word is unpacked into per-lane scalars, the
+    // requested lane is copied to s, and s is then broadcast from `rank`.
+    // Asserts that l's checkerboard matches that of `site` and that
+    // sizeof(sobj)*Nsimd equals sizeof(vobj).
+    template<class vobj,class sobj>
+      void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      GridBase *grid=l._grid;
+      int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      int rank,odx,idx;
+      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
+
+      // One scalar slot per SIMD lane, addressed through scalar_type pointers.
+      std::vector<sobj> laneObjs(Nsimd);
+      std::vector<scalar_type *> lanePtrs(Nsimd);
+      for(int lane=0;lane<Nsimd;lane++){
+        lanePtrs[lane] = (scalar_type *)&laneObjs[lane];
+      }
+
+      extract(l._odata[odx],lanePtrs);
+
+      s = laneObjs[idx];
+      grid->Broadcast(rank,s);
+    }
+
+}
+#endif
+
--- a/lib/lattice/Grid_lattice_reality.h
+++ b/lib/lattice/Grid_lattice_reality.h
@@ -0,0 +1,52 @@
+#ifndef GRID_LATTICE_REALITY_H
+#define GRID_LATTICE_REALITY_H
+
+
+// FIXME: this is the part of the code whose
+// direction I am most worried about.
+// The choice of burying complex numbers inside the SIMD types
+// makes the use of "real" and "imag" very cumbersome.
+
+namespace Grid {
+
+    // Applies the site-level adj() to every outer site of lhs and returns
+    // the results as a new lattice on the same grid.
+    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
+      Lattice<vobj> adjoint(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        adjoint._odata[site] = adj(lhs._odata[site]);
+      }
+      return adjoint;
+    }
+
+    // Applies the site-level conj() to every outer site of lhs and returns
+    // the results as a new lattice on the same grid.
+    template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){
+      Lattice<vobj> conjugated(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        conjugated._odata[site] = conj(lhs._odata[site]);
+      }
+      return conjugated;
+    }
+
+    // Applies the site-level real() to every outer site of z and returns the
+    // results as a new lattice (of whatever type site-level real() yields)
+    // on the same grid.
+    template<class vobj> inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
+    {
+      typedef decltype(real(z._odata[0])) site_type;
+      Lattice<site_type> realPart(z._grid);
+      const auto osites = z._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        realPart._odata[site] = real(z._odata[site]);
+      }
+      return realPart;
+    }
+
+    // Applies the site-level imag() to every outer site of z and returns the
+    // results as a new lattice (of whatever type site-level imag() yields)
+    // on the same grid.
+    template<class vobj> inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
+    {
+      typedef decltype(imag(z._odata[0])) site_type;
+      Lattice<site_type> imagPart(z._grid);
+      const auto osites = z._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        imagPart._odata[site] = imag(z._odata[site]);
+      }
+      return imagPart;
+    }
+
+
+}
+#endif
--- a/lib/lattice/Grid_lattice_reduction.h
+++ b/lib/lattice/Grid_lattice_reduction.h
@@ -0,0 +1,45 @@
+#ifndef GRID_LATTICE_REDUCTION_H
+#define GRID_LATTICE_REDUCTION_H
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Reduction operations
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Global squared norm of a lattice: accumulates innerProduct(site,site)
+    // over the local outer sites, strips the tensor wrapper, reduces the SIMD
+    // word to a scalar, sums that scalar across the grid with GlobalSum, and
+    // returns its real part.
+    template<class vobj>
+    inline RealD norm2(const Lattice<vobj> &arg){
+
+      typedef typename vobj::scalar_type scalar;
+      typedef typename vobj::vector_type vector;
+      typedef decltype(innerProduct(arg._odata[0],arg._odata[0])) accum_type;
+
+      accum_type localSum=zero;
+      //FIXME make this loop parallelisable
+      for(int site=0;site<arg._grid->oSites(); site++){
+        localSum = localSum + innerProduct(arg._odata[site],arg._odata[site]);
+      }
+
+      vector simdSum =TensorRemove(localSum) ;
+      scalar total;
+      total = Reduce(simdSum);
+      arg._grid->GlobalSum(total);
+      return real(total);
+    }
+
+    // Global inner product of two lattices: accumulates
+    // innerProduct(left[site],right[site]) over the local outer sites (taken
+    // from left's grid), reduces the accumulator to a scalar, and sums it
+    // across right's grid with GlobalSum.
+    // NOTE(review): unlike norm2(), Reduce() is applied without TensorRemove(),
+    // and the declared return type is the per-site inner-product type while a
+    // scalar is returned -- confirm both are intended.
+    template<class vobj>
+    inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
+    {
+      typedef typename vobj::scalar_type scalar;
+      typedef decltype(innerProduct(left._odata[0],right._odata[0])) accum_type;
+
+      accum_type localSum=zero;
+      //FIXME make this loop parallelisable
+      for(int site=0;site<left._grid->oSites(); site++){
+        localSum = localSum + innerProduct(left._odata[site],right._odata[site]);
+      }
+
+      scalar total;
+      total = Reduce(localSum);
+      right._grid->GlobalSum(total);
+      return total;
+    }
+
+}
+#endif
+
--- a/lib/lattice/Grid_lattice_rng.h
+++ b/lib/lattice/Grid_lattice_rng.h
@@ -0,0 +1,32 @@
+#ifndef GRID_LATTICE_RNG_H
+#define GRID_LATTICE_RNG_H
+
+namespace Grid {
+
+    // FIXME Randomise; deprecate this
+    // Treats the lattice storage as a flat array of Real and overwrites every
+    // element with a drand48() value.
+    template <class vobj> inline void random(Lattice<vobj> &l){
+      Real *words = (Real *)&l._odata[0];
+      size_t bytes  = l._grid->oSites()*sizeof(vobj);
+      size_t nwords = bytes/sizeof(Real);
+
+      for(int w=0;w<nwords;w++){
+        words[w]=drand48();
+      }
+    }
+    
+    // FIXME Implement a consistent seed management strategy
+    // Treats the lattice storage as a flat array of Real and overwrites every
+    // element with a normal deviate (zero mean, unit variance).
+    template <class vobj> inline void gaussian(Lattice<vobj> &l){
+        // Zero mean, unit variance.
+        std::normal_distribution<double> distribution(0.0,1.0);
+        // Seed the engine from the drand48-family stream, so any srand48()
+        // seeding that governs random() also governs this routine.
+        std::mt19937 engine(static_cast<std::mt19937::result_type>(lrand48()));
+        Real *v_ptr = (Real *)&l._odata[0];
+        size_t v_len = l._grid->oSites()*sizeof(vobj);
+        size_t d_len = v_len/sizeof(Real);
+
+        for(size_t i=0;i<d_len;i++){
+          // Draw from the distribution (the old code stored uniform drand48() values).
+          v_ptr[i]= distribution(engine);
+        }
+    }
+
+}
+#endif
--- a/lib/lattice/Grid_lattice_trace.h
+++ b/lib/lattice/Grid_lattice_trace.h
@@ -0,0 +1,43 @@
+#ifndef GRID_LATTICE_TRACE_H
+#define GRID_LATTICE_TRACE_H
+
+///////////////////////////////////////////////
+// Tracing, transposing, peeking, poking
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies the site-level trace() to every outer site of lhs and returns
+    // the results as a new lattice on the same grid.
+    template<class vobj>
+    inline auto trace(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(trace(lhs._odata[0]))>
+    {
+      typedef decltype(trace(lhs._odata[0])) site_type;
+      Lattice<site_type> traced(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        traced._odata[site] = trace(lhs._odata[site]);
+      }
+      return traced;
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace Index level dependent operation
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies the site-level traceIndex<Index>() to every outer site of lhs
+    // and returns the results as a new lattice on the same grid.
+    template<int Index,class vobj>
+    inline auto traceIndex(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    {
+      typedef decltype(traceIndex<Index>(lhs._odata[0])) site_type;
+      Lattice<site_type> traced(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        traced._odata[site] = traceIndex<Index>(lhs._odata[site]);
+      }
+      return traced;
+    }
+
+
+}
+#endif
+
--- a/lib/lattice/Grid_lattice_transfer.h
+++ b/lib/lattice/Grid_lattice_transfer.h
@@ -0,0 +1,46 @@
+#ifndef GRID_LATTICE_TRANSFER_H
+#define GRID_LATTICE_TRANSFER_H
+
+namespace Grid {
+
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // remove and insert a half checkerboard
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // Copy the sites of `full` that lie on checkerboard `cb` into `half`, in
+  // increasing order of the full lattice's outer index, and tag `half` with
+  // that checkerboard.
+  //
+  //   cb   : checkerboard value to select.
+  //   half : destination; receives one entry per selected site, packed
+  //          consecutively from index 0.
+  //   full : source lattice.
+  //
+  // The loop is intentionally serial: the destination index ssh is a running
+  // counter, so executing iterations concurrently would race on it and
+  // scramble the site mapping.
+  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
+    half.checkerboard = cb;
+    int ssh=0;
+    for(int ss=0;ss<full._grid->oSites();ss++){
+      std::vector<int> coor;
+      int cbos;
+
+      full._grid->oCoorFromOindex(coor,ss);
+      cbos=half._grid->CheckerBoard(coor);
+
+      if (cbos==cb) {
+        half._odata[ssh] = full._odata[ss];
+        ssh++;
+      }
+    }
+  }
+  // Scatter the entries of `half` back into the sites of `full` that lie on
+  // half's checkerboard, visiting `full` in increasing outer-index order and
+  // consuming `half` consecutively from index 0. Sites of `full` on the other
+  // checkerboard are left untouched.
+  //
+  // The loop is intentionally serial: the source index ssh is a running
+  // counter, so executing iterations concurrently would race on it and
+  // scramble the site mapping.
+  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
+    int cb = half.checkerboard;
+    int ssh=0;
+    for(int ss=0;ss<full._grid->oSites();ss++){
+      std::vector<int> coor;
+      int cbos;
+
+      full._grid->oCoorFromOindex(coor,ss);
+      cbos=half._grid->CheckerBoard(coor);
+
+      if (cbos==cb) {
+        full._odata[ss]=half._odata[ssh];
+        ssh++;
+      }
+    }
+  }
+  
+}
+#endif
--- a/lib/lattice/Grid_lattice_transpose.h
+++ b/lib/lattice/Grid_lattice_transpose.h
@@ -0,0 +1,39 @@
+#ifndef GRID_LATTICE_TRANSPOSE_H
+#define GRID_LATTICE_TRANSPOSE_H
+
+///////////////////////////////////////////////
+// Transpose
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Transpose
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Applies the site-level transpose() to every outer site of lhs and
+  // returns the results as a new lattice on the same grid.
+  template<class vobj>
+    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
+      Lattice<vobj> transposed(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        transposed._odata[site] = transpose(lhs._odata[site]);
+      }
+      return transposed;
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Index level dependent transpose
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies the site-level transposeIndex<Index>() to every outer site of
+    // lhs and returns the results as a new lattice on the same grid.
+    template<int Index,class vobj>
+    inline auto transposeIndex(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
+    {
+      typedef decltype(transposeIndex<Index>(lhs._odata[0])) site_type;
+      Lattice<site_type> transposed(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+#pragma omp parallel for
+      for(int site=0;site<osites;site++){
+        transposed._odata[site] = transposeIndex<Index>(lhs._odata[site]);
+      }
+      return transposed;
+    }
+
+}
+#endif