Global changes to parallel_for structure.

Move the comms flags to more sensible names
2026-06-24 20:43:29 +01:00 · 2017-02-21 05:24:27 -05:00
parent 3906cd2149
commit 3ae92fa2e6
43 changed files with 271 additions and 513 deletions
@@ -39,8 +39,7 @@ namespace Grid {
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,rhs);
    conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      obj1 tmp;
      mult(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
@@ -120,8 +115,7 @@ PARALLEL_FOR_LOOP
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,lhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
      obj1 tmp;
      mac(&tmp,&lhs._odata[ss],&rhs);
      vstream(ret._odata[ss],tmp);
@@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(ret,lhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs._odata[ss],&rhs);
@@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
    ret.checkerboard = lhs.checkerboard;
    conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs._odata[ss],&rhs);
@@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP
    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mult(&tmp,&lhs,&rhs._odata[ss]);
@@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP
    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      mac(&tmp,&lhs,&rhs._odata[ss]);
@@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP
    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      sub(&tmp,&lhs,&rhs._odata[ss]);
@@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP
    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      obj1 tmp;
      add(&tmp,&lhs,&rhs._odata[ss]);
@@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = x.checkerboard;
    conformable(ret,x);
    conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+y._odata[ss];
      vstream(ret._odata[ss],tmp);
@@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP
    ret.checkerboard = x.checkerboard;
    conformable(ret,x);
    conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
      vstream(ret._odata[ss],tmp);
@@ -121,8 +121,7 @@ public:
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP
    assert( (cb==Odd) || (cb==Even));
    checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      //vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,eval(ss,expr));
@@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP
    checkerboard=cb;

    _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP
    checkerboard=cb;

    _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
@@ -235,8 +230,7 @@ PARALLEL_FOR_LOOP
    checkerboard=cb;

    _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
      vstream(_odata[ss] ,eval(ss,expr));
    }
  };
@@ -258,8 +252,7 @@ PARALLEL_FOR_LOOP
    	_grid = r._grid;
    	checkerboard = r.checkerboard;
    	_odata.resize(_grid->oSites());// essential
-  		PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
+	parallel_for(int ss=0;ss<_grid->oSites();ss++){
            _odata[ss]=r._odata[ss];
        }  	
    }
@@ -269,8 +262,7 @@ PARALLEL_FOR_LOOP
    virtual ~Lattice(void) = default;
    
    template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
+      parallel_for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r;
        }
        return *this;
@@ -279,8 +271,7 @@ PARALLEL_FOR_LOOP
      this->checkerboard = r.checkerboard;
      conformable(*this,r);
      
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<_grid->oSites();ss++){
+      parallel_for(int ss=0;ss<_grid->oSites();ss++){
            this->_odata[ss]=r._odata[ss];
        }
        return *this;
@@ -45,90 +45,87 @@ namespace Grid {
  //////////////////////////////////////////////////////////////////////////
  template<class vfunctor,class lobj,class robj>  
    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
-    {
-      Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
+  {
+    Lattice<vInteger> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
    }
+    return ret;
+  }
  //////////////////////////////////////////////////////////////////////////
  // compare lattice to scalar
  //////////////////////////////////////////////////////////////////////////
-    template<class vfunctor,class lobj,class robj> 
+  template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
-    {
-      Lattice<vInteger> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites(); ss++){
-	  ret._odata[ss]=op(lhs._odata[ss],rhs);
-        }
-        return ret;
+  {
+    Lattice<vInteger> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs._odata[ss],rhs);
    }
+    return ret;
+  }
  //////////////////////////////////////////////////////////////////////////
  // compare scalar to lattice
  //////////////////////////////////////////////////////////////////////////
-    template<class vfunctor,class lobj,class robj> 
+  template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
-    {
-      Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=op(lhs._odata[ss],rhs);
-        }
-        return ret;
+  {
+    Lattice<vInteger> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs,rhs._odata[ss]); // lhs is the non-lattice operand; only rhs (the Lattice) is indexed per site
    }
-
+    return ret;
+  }
+  
  //////////////////////////////////////////////////////////////////////////
  // Map to functors
  //////////////////////////////////////////////////////////////////////////
-    // Less than
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
-     return LLComparison(vlt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
-     return LSComparison(vlt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
-     return SLComparison(vlt<lobj,robj>(),lhs,rhs);
-   }
-
-   // Less than equal
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
-     return LLComparison(vle<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
-     return LSComparison(vle<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
-     return SLComparison(vle<lobj,robj>(),lhs,rhs);
-   }
-
-   // Greater than 
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
-     return LLComparison(vgt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
-     return LSComparison(vgt<lobj,robj>(),lhs,rhs);
-   }
-   template<class lobj,class robj>
-   inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
+  // Less than
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
+    return SLComparison(vlt<lobj,robj>(),lhs,rhs);
+  }
+  
+  // Less than equal
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
+    return SLComparison(vle<lobj,robj>(),lhs,rhs);
+  }
+  
+  // Greater than 
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    return LLComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
+    return LSComparison(vgt<lobj,robj>(),lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
     return SLComparison(vgt<lobj,robj>(),lhs,rhs);
-   }
-
-
-   // Greater than equal
+  }
+  
+  
+  // Greater than equal
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     return LLComparison(vge<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
@@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP
     return LSComparison(vge<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
+     inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
     return SLComparison(vge<lobj,robj>(),lhs,rhs);
   }
-
+   
   // equal
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     return LLComparison(veq<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
+     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
     return LSComparison(veq<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
+     inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
     return SLComparison(veq<lobj,robj>(),lhs,rhs);
   }
-
-
+   
+   
   // not equal
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
     return LLComparison(vne<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
+     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
     return LSComparison(vne<lobj,robj>(),lhs,rhs);
   }
   template<class lobj,class robj>
-   inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
+     inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
     return SLComparison(vne<lobj,robj>(),lhs,rhs);
   }
-
 }
 #endif
@@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

-    /////////////////////////////////////////////////////
-    // Non site, reduced locally reduced routines
-    /////////////////////////////////////////////////////
-
-    // localNorm2,
-    template<class vobj>
+  /////////////////////////////////////////////////////
+  // Non site, reduced locally reduced routines
+  /////////////////////////////////////////////////////
+  
+  // localNorm2,
+  template<class vobj>
    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
    {
      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
+      }
+      return ret;
    }
-    
-    // localInnerProduct
-    template<class vobj>
+  
+  // localInnerProduct
+  template<class vobj>
    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
    {
      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
 	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
      }
      return ret;
    }
-    
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
+  
+  // outerProduct Scalar x Scalar -> Scalar
+  //              Vector x Vector -> Matrix
+  template<class ll,class rr>
    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-     }
-
+  {
+    Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
+  }
 }
-
 #endif
@@ -37,8 +37,7 @@ namespace Grid {
  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
  {
    Lattice<vobj> ret(r._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<r._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
      vstream(ret._odata[ss], -r._odata[ss]);
    }
    return ret;
@@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP
  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
  {
    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
      decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss]; 
      vstream(ret._odata[ss],tmp);
 	   //      ret._odata[ss]=lhs*rhs._odata[ss];
@@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
    {
      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];  
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss]; // operator + must add; the old line subtracted
 	vstream(ret._odata[ss],tmp);
 	//	ret._odata[ss]=lhs+rhs._odata[ss];
@@ -98,11 +95,9 @@ PARALLEL_FOR_LOOP
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
  {
    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
      decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];  
      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs-rhs._odata[ss];
    }
    return ret;
  }
@@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP
      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
    {
      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<lhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
 	vstream(ret._odata[ss],tmp);
 	//            ret._odata[ss]=lhs._odata[ss]*rhs;
@@ -122,8 +116,7 @@ PARALLEL_FOR_LOOP
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
    {
        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ // bound by lhs._grid: lhs is the indexed lattice and ret is built on lhs._grid
 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; 
 	  vstream(ret._odata[ss],tmp);
 	  //	  ret._odata[ss]=lhs._odata[ss]+rhs;
@@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
    {
      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ // bound by lhs._grid: lhs is the indexed lattice and ret is built on lhs._grid
 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
 	  vstream(ret._odata[ss],tmp);
 	  //	ret._odata[ss]=lhs._odata[ss]-rhs;
      }
      return ret;
    }
-
-
 }
 #endif
@@ -44,22 +44,20 @@ namespace Grid {
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
      ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
+      }
+      return ret;
    };
    template<int Index,class vobj>
-       auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+      auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
    {
      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
      ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
-        }
-        return ret;
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
+      }
+      return ret;
    };

    ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP
    template<int Index,class vobj> 
    void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
    {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
-	}      
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
+      }      
    }
    template<int Index,class vobj>
      void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
    {
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
-	}      
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
+      }      
    }

    //////////////////////////////////////////////////////
@@ -131,9 +127,6 @@ PARALLEL_FOR_LOOP

      assert( l.checkerboard == l._grid->CheckerBoard(site));

-      // FIXME
-      //      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
-
      int rank,odx,idx;
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);

@@ -40,8 +40,7 @@ namespace Grid {

    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
        Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = adj(lhs._odata[ss]);
        }
        return ret;
@@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP

    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
        Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = conjugate(lhs._odata[ss]);
+	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  ret._odata[ss] = conjugate(lhs._odata[ss]);
        }
        return ret;
    };
-
-
 }
 #endif
@@ -57,8 +57,7 @@ namespace Grid {
 	sumarray[i]=zero;
      }

-PARALLEL_FOR_LOOP
-      for(int thr=0;thr<grid->SumArraySize();thr++){
+      parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 	int nwork, mywork, myoff;
 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
 	
@@ -68,7 +67,7 @@ PARALLEL_FOR_LOOP
 	}
 	sumarray[thr]=TensorRemove(vnrm) ;
      }
-    
+      
      vector_type vvnrm; vvnrm=zero;  // sum across threads
      for(int i=0;i<grid->SumArraySize();i++){
 	vvnrm = vvnrm+sumarray[i];
@@ -114,18 +113,17 @@ PARALLEL_FOR_LOOP
 	sumarray[i]=zero;
      }

-PARALLEL_FOR_LOOP
-      for(int thr=0;thr<grid->SumArraySize();thr++){
+      parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 	int nwork, mywork, myoff;
 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
-
+	
 	vobj vvsum=zero;
        for(int ss=myoff;ss<mywork+myoff; ss++){
 	  vvsum = vvsum + arg._odata[ss];
 	}
 	sumarray[thr]=vvsum;
      }
-
+      
      vobj vsum=zero;  // sum across threads
      for(int i=0;i<grid->SumArraySize();i++){
 	vsum = vsum+sumarray[i];
@@ -302,8 +302,7 @@ namespace Grid {
      int words=sizeof(scalar_object)/sizeof(scalar_type);


-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<osites;ss++){
+      parallel_for(int ss=0;ss<osites;ss++){

 	std::vector<scalar_object> buf(Nsimd);
 	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
@@ -42,8 +42,7 @@ namespace Grid {
      -> Lattice<decltype(trace(lhs._odata[0]))>
    {
      Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
            ret._odata[ss] = trace(lhs._odata[ss]);
        }
        return ret;
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
    inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<lhs._grid->oSites();ss++){
+      parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
      }
      return ret;
@@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
-    //PARALLEL_FOR_LOOP
+    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
-    //PARALLEL_FOR_LOOP
+    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;
@@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
  }

-PARALLEL_FOR_LOOP
-  for(int sf=0;sf<fine->oSites();sf++){
+  parallel_for(int sf=0;sf<fine->oSites();sf++){
    
    int sc;
    std::vector<int> coor_c(_ndimension);
@@ -186,8 +185,7 @@ template<class vobj,class CComplex>

  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<coarse->oSites();ss++){
+  parallel_for(int ss=0;ss<coarse->oSites();ss++){
    CoarseInner._odata[ss] = coarse_inner._odata[ss];
  }
 }
@@ -347,8 +345,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->lSites() == og->lSites());
  }

-  PARALLEL_FOR_LOOP
-  for(int idx=0;idx<ig->lSites();idx++){
+  parallel_for(int idx=0;idx<ig->lSites();idx++){
    sobj s;
    ssobj ss;

@@ -386,8 +383,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
  }

  // the above should guarantee that the operations are local
-  PARALLEL_FOR_LOOP
-  for(int idx=0;idx<lg->lSites();idx++){
+  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -428,8 +424,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
    }
  }
  // the above should guarantee that the operations are local
-  PARALLEL_FOR_LOOP
-  for(int idx=0;idx<lg->lSites();idx++){
+  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -468,8 +463,7 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice
  }

  // the above should guarantee that the operations are local
-  PARALLEL_FOR_LOOP
-  for(int idx=0;idx<lg->lSites();idx++){
+  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -504,8 +498,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
  }

  // the above should guarantee that the operations are local
-  PARALLEL_FOR_LOOP
-  for(int idx=0;idx<lg->lSites();idx++){
+  parallel_for(int idx=0;idx<lg->lSites();idx++){
    sobj s;
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -574,8 +567,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
    in_grid->iCoorFromIindex(in_icoor[lane], lane);
  }
  
-PARALLEL_FOR_LOOP
-  for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+  parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> out_ptrs(in_nsimd);

@@ -623,8 +615,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);
    
-  PARALLEL_FOR_LOOP
-  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+  parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);

@@ -642,10 +633,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
    merge(out._odata[out_oidx], ptrs, 0);
  }
 }
-
-
-  
-
 
 }
 #endif
@@ -40,27 +40,24 @@ namespace Grid {
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj>
    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
-        Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = transpose(lhs._odata[ss]);
-        }
-        return ret;
-    };
+    Lattice<vobj> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      ret._odata[ss] = transpose(lhs._odata[ss]);
+    }
+    return ret;
+  };
    
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Index level dependent transpose
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<int Index,class vobj>
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Index level dependent transpose
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<int Index,class vobj>
    inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
-    {
-      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
-        }
-        return ret;
-    };
-
+  {
+    Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
+    }
+    return ret;
+  };
 }
 #endif
@@ -37,8 +37,7 @@ namespace Grid {
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=pow(rhs._odata[ss],y);
    }
    return ret;
@@ -47,8 +46,7 @@ PARALLEL_FOR_LOOP
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=mod(rhs._odata[ss],y);
    }
    return ret;
@@ -58,8 +56,7 @@ PARALLEL_FOR_LOOP
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=div(rhs._odata[ss],y);
    }
    return ret;
@@ -69,8 +66,7 @@ PARALLEL_FOR_LOOP
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
    }
    return ret;
@@ -56,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  std::vector<scalar_object> truevals (Nsimd);
  std::vector<scalar_object> falsevals(Nsimd);

-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){

    extract(iftrue._odata[ss]   ,truevals);
    extract(iffalse._odata[ss]  ,falsevals);