paboyle/Grid — https://github.com/paboyle/Grid.git
commit 3ae92fa2e6 (parent 3906cd2149)

Global changes to parallel_for structure.
Move the comms flags to more sensible names.
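At its core the change is mechanical: every occurrence of the two-line idiom — the PARALLEL_FOR_LOOP (or PARALLEL_NESTED_LOOP2) pragma macro followed by a bare for statement — collapses into a single parallel_for (or parallel_for_nest2) construct, defined in a later hunk of this diff. In sketch, each call site changes like this:

    // before: pragma macro and loop statement on separate lines
    PARALLEL_FOR_LOOP
    for(int ss=0;ss<N;ss++){ out[ss] = f(in[ss]); }

    // after: one construct, expanding back to the same pragma + loop
    parallel_for(int ss=0;ss<N;ss++){ out[ss] = f(in[ss]); }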
@@ -77,8 +77,7 @@ int main (int argc, char ** argv)
   }

   double start=usecond();
-PARALLEL_FOR_LOOP
-  for(int t=0;t<threads;t++){
+  parallel_for(int t=0;t<threads;t++){

     sum[t] = x[t]._odata[0];
     for(int i=0;i<Nloop;i++){
@@ -342,11 +342,11 @@ void Grid_init(int *argc,char ***argv)
   } else {
     QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
+    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
   }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sendrecv") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySendrecv);
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
+    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
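The renamed flags feed the communicator policy through SetCommunicatorPolicy, whose declaration appears in a later hunk. For reference, a sketch of the equivalent programmatic selection (same names as in the hunks below):

    // Nonblocking Isend/Irecv with deferred completion (the new default):
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
    // Or blocking, one-exchange-at-a-time semantics:
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);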
@@ -81,77 +81,14 @@ template<class vobj,class cobj,class compressor>
 void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
 {
   int num=table.size();
-PARALLEL_FOR_LOOP
-  for(int i=0;i<num;i++){
+  parallel_for(int i=0;i<num;i++){
     vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
-    // buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
   }
 }

 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
-/*
-template<class cobj,class vobj,class compressor> double
-Gather_plane_exchange(const Lattice<vobj> &rhs,
-                      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
-{
-  int rd = rhs._grid->_rdimensions[dimension];
-  double t1,t2;
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
-    cbmask = 0x3;
-  }
-
-  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-  int e1 =rhs._grid->_slice_nblock[dimension];
-  int e2 =rhs._grid->_slice_block [dimension];
-  int n1 =rhs._grid->_slice_stride[dimension];
-
-  // Need to switch to a table loop
-  std::vector<std::pair<int,int> > table;
-
-  if ( cbmask ==0x3){
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int o = n*n1;
-        int offset = b+n*e2;
-        table.push_back(std::pair<int,int> (offset,o+b));
-      }
-    }
-  } else {
-    // Case of SIMD split AND checker dim cannot currently be hit, except in
-    // Test_cshift_red_black code.
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int o=n*n1;
-        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
-        int offset = b+n*e2;
-
-        if ( ocb & cbmask ) {
-          table.push_back(std::pair<int,int> (offset,o+b));
-        }
-      }
-    }
-  }
-
-  assert( (table.size()&0x1)==0);
-  t1=usecond();
-PARALLEL_FOR_LOOP
-  for(int j=0;j<table.size()/2;j++){
-    // buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
-    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
-    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
-    cobj temp3;
-    cobj temp4;
-    exchange(temp3,temp4,temp1,temp2,type);
-    vstream(pointers[0][j],temp3);
-    vstream(pointers[1][j],temp4);
-  }
-  t2=usecond();
-  return t2-t1;
-}
-*/

 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
                                  std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
@@ -164,8 +101,7 @@ void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const L
   assert( (table.size()&0x1)==0);
   int num=table.size()/2;
   int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
-PARALLEL_FOR_LOOP
-  for(int j=0;j<num;j++){
+  parallel_for(int j=0;j<num;j++){
     // buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
     cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
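Both gather routines share the same table-driven shape: the (destination, source) index pairs are computed once outside the loop, so the hot loop is a flat, dependence-free copy that parallel_for can split across threads with no further analysis. A self-contained sketch of the pattern (plain C++ stand-ins; Grid's versions additionally run each element through a compressor and vstream):

    #include <vector>
    #include <utility>

    // Table-driven gather: every (dst,src) pair is independent, so the loop
    // body can execute in any order or in parallel.
    template<class T>
    void gather_table(const std::vector<std::pair<int,int> > &table,
                      const T *in, T *out, int off, int so) {
      for (int i = 0; i < (int)table.size(); i++) {
        out[off + table[i].first] = in[so + table[i].second];
      }
    }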
@@ -235,19 +171,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
                                           Packets[i].recv_buf,
                                           Packets[i].from_rank,
                                           Packets[i].bytes);
-        if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySendrecv ) {
-          _grid->StencilSendToRecvFromComplete(reqs[i]);
-        }
       }
       commtime+=usecond();
     }
     void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
     {
       commtime-=usecond();
-      if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicyIsend ) {
-        for(int i=0;i<Packets.size();i++){
-          _grid->StencilSendToRecvFromComplete(reqs[i]);
-        }
+      for(int i=0;i<Packets.size();i++){
+        _grid->StencilSendToRecvFromComplete(reqs[i]);
       }
       _grid->StencilBarrier();// Synch shared memory on a single nodes
       commtime+=usecond();
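Read together with the CartesianCommunicator hunks further down, the stencil no longer needs to branch on the policy here: under the Sequential policy each exchange now appears to be completed inside StencilSendToRecvFromBegin itself, so CommunicateComplete can simply wait on whatever requests remain in flight in either mode.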
@@ -327,14 +258,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
       // std::ofstream fout(fname);

       if ( Mergers[i].exchange == 0 ) {
-PARALLEL_FOR_LOOP
-        for(int o=0;o<Mergers[i].buffer_size;o++){
+        parallel_for(int o=0;o<Mergers[i].buffer_size;o++){
          merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
          // fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
        }
       } else {
-PARALLEL_FOR_LOOP
-        for(int o=0;o<Mergers[i].buffer_size/2;o++){
+        parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
          exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
                   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
          // cobj temp1,temp2;
@@ -51,6 +51,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define PARALLEL_CRITICAL
 #endif

+#define parallel_for       PARALLEL_FOR_LOOP for
+#define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
+
 namespace Grid {

 // Introduce a class to gain deterministic bit reproducible reduction.
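The wrapper is pure preprocessor: parallel_for(...) expands to PARALLEL_FOR_LOOP for(...). This diff does not show how PARALLEL_FOR_LOOP itself is defined; the sketch below assumes the usual _Pragma-based OpenMP definition (empty in serial builds), which is what makes the single-token spelling possible:

    // Standalone illustration of the macro mechanics (assumed definitions).
    #include <cstdio>

    #ifdef _OPENMP
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    #else
    #define PARALLEL_FOR_LOOP
    #endif
    #define parallel_for PARALLEL_FOR_LOOP for

    int main(void) {
      static double sum[8];
      // expands to: _Pragma("omp parallel for") for(int t=0;t<8;t++){ ... }
      parallel_for(int t=0;t<8;t++) {
        sum[t] = 2.0*t;
      }
      printf("%g\n",sum[7]);
      return 0;
    }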
@@ -267,8 +267,7 @@ namespace Grid {
       SimpleCompressor<siteVector> compressor;
       Stencil.HaloExchange(in,compressor);

-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<Grid()->oSites();ss++){
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        siteVector res = zero;
        siteVector nbr;
        int ptype;
@@ -380,8 +379,7 @@ PARALLEL_FOR_LOOP
       Subspace.ProjectToSubspace(oProj,oblock);
       // blockProject(iProj,iblock,Subspace.subspace);
       // blockProject(oProj,oblock,Subspace.subspace);
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<Grid()->oSites();ss++){
+      parallel_for(int ss=0;ss<Grid()->oSites();ss++){
        for(int j=0;j<nbasis;j++){
          if( disp!= 0 ) {
            A[p]._odata[ss](j,i) = oProj._odata[ss](j);
@@ -33,7 +33,7 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void * CartesianCommunicator::ShmCommBuf;
 uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
-CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicySendrecv;
+CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;

 /////////////////////////////////
 // Alloc, free shmem region
@@ -118,7 +118,7 @@ class CartesianCommunicator {
   static void * ShmCommBuf;

   // Isend/Irecv/Wait, or Sendrecv blocking
-  enum CommunicatorPolicy_t { CommunicatorPolicyIsend , CommunicatorPolicySendrecv };
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
   static CommunicatorPolicy_t CommunicatorPolicy;
   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }

@@ -158,7 +158,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
   int myrank = _processor;
   int ierr;
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
     MPI_Request xrq;
     MPI_Request rrq;

@@ -178,7 +178,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
@@ -511,7 +511,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   int myrank = _processor;
   int ierr;

-  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) {
+  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
     MPI_Request xrq;
     MPI_Request rrq;

@@ -567,6 +567,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
     list.push_back(xrq);
     off_node_bytes+=bytes;
   }

+  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+    this->StencilSendToRecvFromComplete(list);
+  }
+
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
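The added branch is what gives the Sequential policy its blocking character: every transfer is completed before Begin returns, while Concurrent leaves them all in flight until the matching Complete call. An illustration of the two disciplines using std::async as a stand-in for MPI_Isend/MPI_Irecv (not Grid code):

    #include <future>
    #include <iostream>
    #include <vector>

    enum Policy { Concurrent, Sequential };

    int transfer(int i) { return i*i; }  // stand-in for one halo exchange

    int main() {
      Policy policy = Concurrent;
      std::vector<std::future<int>> inflight;
      for (int i=0;i<4;i++) {
        inflight.push_back(std::async(std::launch::async, transfer, i));
        if (policy == Sequential) inflight.back().wait(); // complete immediately
      }
      for (auto &f : inflight) std::cout << f.get() << "\n"; // Concurrent: wait here
      return 0;
    }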
@@ -585,8 +590,8 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &

   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  list.resize(0);
   assert(ierr==0);
+  list.resize(0);
 }
 void CartesianCommunicator::Barrier(void)
 {
@@ -58,8 +58,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen

   int stride=rhs._grid->_slice_stride[dimension];
   if ( cbmask == 0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o  = n*stride;
        int bo = n*e2;
@@ -78,8 +77,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   }
-PARALLEL_FOR_LOOP
-  for(int i=0;i<table.size();i++){
+  parallel_for(int i=0;i<table.size();i++){
     buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
   }
 }
@@ -105,8 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
   int n1=rhs._grid->_slice_stride[dimension];

   if ( cbmask ==0x3){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o = n*n1;
@@ -122,8 +119,7 @@ PARALLEL_NESTED_LOOP2
     // Case of SIMD split AND checker dim cannot currently be hit, except in
     // Test_cshift_red_black code.
     std::cout << " Dense packed buffer WARNING " <<std::endl;
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o=n*n1;
@@ -175,8 +171,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int stride=rhs._grid->_slice_stride[dimension];

   if ( cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o  =n*rhs._grid->_slice_stride[dimension];
        int bo =n*rhs._grid->_slice_block[dimension];
@@ -195,8 +190,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   }
-PARALLEL_FOR_LOOP
-  for(int i=0;i<table.size();i++){
+  parallel_for(int i=0;i<table.size();i++){
     // std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
     rhs._odata[table[i].first]=buffer[table[i].second];
   }
@@ -220,8 +214,7 @@ PARALLEL_FOR_LOOP
   int e2=rhs._grid->_slice_block[dimension];

   if(cbmask ==0x3 ) {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o      = n*rhs._grid->_slice_stride[dimension];
        int offset = b+n*rhs._grid->_slice_block[dimension];
@@ -265,8 +258,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e2=rhs._grid->_slice_block[dimension];
   int stride = rhs._grid->_slice_stride[dimension];
   if(cbmask == 0x3 ){
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o =n*stride+b;
@@ -275,8 +267,7 @@ PARALLEL_NESTED_LOOP2
       }
     }
   } else {
-PARALLEL_NESTED_LOOP2
-    for(int n=0;n<e1;n++){
+    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

        int o =n*stride+b;
@@ -306,8 +297,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block [dimension];
   int stride = rhs._grid->_slice_stride[dimension];
-PARALLEL_NESTED_LOOP2
-  for(int n=0;n<e1;n++){
+  parallel_for_nest2(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){

      int o =n*stride;
@@ -39,8 +39,7 @@ namespace Grid {
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = lhs.checkerboard;
     conformable(ret,rhs);
     conformable(lhs,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP
   void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
     ret.checkerboard = lhs.checkerboard;
     conformable(lhs,ret);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
       obj1 tmp;
       mult(&tmp,&lhs._odata[ss],&rhs);
       vstream(ret._odata[ss],tmp);
|
|||||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.checkerboard = lhs.checkerboard;
|
ret.checkerboard = lhs.checkerboard;
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
mac(&tmp,&lhs._odata[ss],&rhs);
|
mac(&tmp,&lhs._odata[ss],&rhs);
|
||||||
vstream(ret._odata[ss],tmp);
|
vstream(ret._odata[ss],tmp);
|
||||||
@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.checkerboard = lhs.checkerboard;
|
ret.checkerboard = lhs.checkerboard;
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
sub(&tmp,&lhs._odata[ss],&rhs);
|
sub(&tmp,&lhs._odata[ss],&rhs);
|
||||||
@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.checkerboard = lhs.checkerboard;
|
ret.checkerboard = lhs.checkerboard;
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
add(&tmp,&lhs._odata[ss],&rhs);
|
add(&tmp,&lhs._odata[ss],&rhs);
|
||||||
@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.checkerboard = rhs.checkerboard;
|
ret.checkerboard = rhs.checkerboard;
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
mult(&tmp,&lhs,&rhs._odata[ss]);
|
mult(&tmp,&lhs,&rhs._odata[ss]);
|
||||||
@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP
|
|||||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.checkerboard = rhs.checkerboard;
|
ret.checkerboard = rhs.checkerboard;
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites();ss++){
|
|
||||||
#ifdef STREAMING_STORES
|
#ifdef STREAMING_STORES
|
||||||
obj1 tmp;
|
obj1 tmp;
|
||||||
mac(&tmp,&lhs,&rhs._odata[ss]);
|
mac(&tmp,&lhs,&rhs._odata[ss]);
|
||||||
@@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP
   void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       sub(&tmp,&lhs,&rhs._odata[ss]);
@@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP
   void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
     ret.checkerboard = rhs.checkerboard;
     conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       obj1 tmp;
       add(&tmp,&lhs,&rhs._odata[ss]);
@@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = x.checkerboard;
     conformable(ret,x);
     conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+y._odata[ss];
       vstream(ret._odata[ss],tmp);
@@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP
     ret.checkerboard = x.checkerboard;
     conformable(ret,x);
     conformable(x,y);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<x._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<x._grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = a*x._odata[ss]+b*y._odata[ss];
       vstream(ret._odata[ss],tmp);
@@ -121,8 +121,7 @@ public:
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP
     assert( (cb==Odd) || (cb==Even));
     checkerboard=cb;

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       //vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,eval(ss,expr));
@@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;

     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;

     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
@@ -235,8 +230,7 @@ PARALLEL_FOR_LOOP
     checkerboard=cb;

     _odata.resize(_grid->oSites());
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       vstream(_odata[ss] ,eval(ss,expr));
     }
   };
@@ -258,8 +252,7 @@ PARALLEL_FOR_LOOP
     _grid = r._grid;
     checkerboard = r.checkerboard;
     _odata.resize(_grid->oSites());// essential
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       _odata[ss]=r._odata[ss];
     }
   }
@@ -269,8 +262,7 @@ PARALLEL_FOR_LOOP
     virtual ~Lattice(void) = default;

     template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
-PARALLEL_FOR_LOOP
-      for(int ss=0;ss<_grid->oSites();ss++){
+      parallel_for(int ss=0;ss<_grid->oSites();ss++){
        this->_odata[ss]=r;
      }
      return *this;
@@ -279,8 +271,7 @@ PARALLEL_FOR_LOOP
     this->checkerboard = r.checkerboard;
     conformable(*this,r);

-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<_grid->oSites();ss++){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
       this->_odata[ss]=r._odata[ss];
     }
     return *this;
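None of the Lattice assignment or expression-template hunks above change user-visible behaviour; a site-wise expression is still evaluated in one parallel sweep over oSites(). A hedged usage sketch (field and grid construction elided):

    // LatticeFermion r(grid), x(grid), y(grid);  RealD a;
    // r = a*x + y;   // evaluated by one parallel_for over _grid->oSites(),
    //                // with streaming stores when STREAMING_STORES is defined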
@@ -45,90 +45,87 @@ namespace Grid {
 //////////////////////////////////////////////////////////////////////////
 template<class vfunctor,class lobj,class robj>
 inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
 {
   Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<rhs._grid->oSites(); ss++){
-    ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
-  }
-  return ret;
-}
+  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
+  }
+  return ret;
+}
 //////////////////////////////////////////////////////////////////////////
 // compare lattice to scalar
 //////////////////////////////////////////////////////////////////////////
 template<class vfunctor,class lobj,class robj>
 inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
 {
   Lattice<vInteger> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<lhs._grid->oSites(); ss++){
-    ret._odata[ss]=op(lhs._odata[ss],rhs);
-  }
-  return ret;
-}
+  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+    ret._odata[ss]=op(lhs._odata[ss],rhs);
+  }
+  return ret;
+}
 //////////////////////////////////////////////////////////////////////////
 // compare scalar to lattice
 //////////////////////////////////////////////////////////////////////////
 template<class vfunctor,class lobj,class robj>
 inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
 {
   Lattice<vInteger> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-  for(int ss=0;ss<rhs._grid->oSites(); ss++){
-    ret._odata[ss]=op(lhs,rhs._odata[ss]);
-  }
-  return ret;
-}
+  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    ret._odata[ss]=op(lhs,rhs._odata[ss]);
+  }
+  return ret;
+}

 //////////////////////////////////////////////////////////////////////////
 // Map to functors
 //////////////////////////////////////////////////////////////////////////
 // Less than
 template<class lobj,class robj>
 inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vlt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vlt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vlt<lobj,robj>(),lhs,rhs);
 }

 // Less than equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vle<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vle<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vle<lobj,robj>(),lhs,rhs);
 }

 // Greater than
 template<class lobj,class robj>
 inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vgt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vgt<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vgt<lobj,robj>(),lhs,rhs);
 }


 // Greater than equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vge<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
@@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP
   return LSComparison(vge<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vge<lobj,robj>(),lhs,rhs);
 }

 // equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(veq<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(veq<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(veq<lobj,robj>(),lhs,rhs);
 }


 // not equal
 template<class lobj,class robj>
 inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
   return LLComparison(vne<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
   return LSComparison(vne<lobj,robj>(),lhs,rhs);
 }
 template<class lobj,class robj>
 inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
   return SLComparison(vne<lobj,robj>(),lhs,rhs);
 }

 }
 #endif
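The comparison overloads produce a Lattice<vInteger> mask by applying a functor site by site. A self-contained analogue of the LLComparison pattern (std::vector in place of Lattice; illustrative only):

    #include <cstdio>
    #include <vector>

    // Functor applied element-wise to build an integer mask, as LLComparison
    // does per lattice site.
    template<class F,class T>
    std::vector<int> compare(F op,const std::vector<T> &l,const std::vector<T> &r){
      std::vector<int> ret(l.size());
      for (int ss=0; ss<(int)l.size(); ss++) ret[ss]=op(l[ss],r[ss]);
      return ret;
    }

    struct vlt { template<class T> int operator()(const T &a,const T &b) const { return a<b; } };

    int main() {
      std::vector<double> a = {1,3,2}, b = {2,2,2};
      for (int m : compare(vlt(),a,b)) printf("%d ",m);  // prints: 1 0 0
      printf("\n");
      return 0;
    }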
@@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

   /////////////////////////////////////////////////////
   // Non site, reduced locally reduced routines
   /////////////////////////////////////////////////////

   // localNorm2,
   template<class vobj>
   inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
   {
     Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
   }

   // localInnerProduct
   template<class vobj>
   inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
   {
     Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
     }
     return ret;
   }

   // outerProduct Scalar x Scalar -> Scalar
   //              Vector x Vector -> Matrix
   template<class ll,class rr>
   inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
   {
     Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+    }
+    return ret;
   }

 }

 #endif
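Unlike the global reductions, localNorm2 and localInnerProduct reduce only over internal (spin/colour) indices, returning a lattice with one tensor_reduced value per site — which is why the loop is a plain parallel_for with no cross-site accumulation. Roughly, in usage (sketch, setup elided):

    // Lattice<typename vobj::tensor_reduced> d = localNorm2(x);          // |x(site)|^2 per site
    // Lattice<typename vobj::tensor_reduced> p = localInnerProduct(x,y); // <x(site),y(site)> per site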
@@ -37,8 +37,7 @@ namespace Grid {
   inline Lattice<vobj> operator -(const Lattice<vobj> &r)
   {
     Lattice<vobj> ret(r._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<r._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
      vstream(ret._odata[ss], -r._odata[ss]);
    }
    return ret;
@@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP
   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
   {
     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs*rhs._odata[ss];
@@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP
   inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
   {
     Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs+rhs._odata[ss];
|
|||||||
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
|
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
|
||||||
{
|
{
|
||||||
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
|
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
|
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
|
||||||
vstream(ret._odata[ss],tmp);
|
vstream(ret._odata[ss],tmp);
|
||||||
// ret._odata[ss]=lhs-rhs._odata[ss];
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP
   inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
   {
     Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
       decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs._odata[ss]*rhs;
|
|||||||
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
|
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
|
||||||
{
|
{
|
||||||
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
|
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
||||||
for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
|
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
|
||||||
vstream(ret._odata[ss],tmp);
|
vstream(ret._odata[ss],tmp);
|
||||||
// ret._odata[ss]=lhs._odata[ss]+rhs;
|
// ret._odata[ss]=lhs._odata[ss]+rhs;
|
||||||
@@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP
   inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
   {
     Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
       decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
       vstream(ret._odata[ss],tmp);
       // ret._odata[ss]=lhs._odata[ss]-rhs;
     }
     return ret;
   }


 }
 #endif
@@ -44,22 +44,20 @@ namespace Grid {
   {
     Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
     ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
+    }
+    return ret;
   };
   template<int Index,class vobj>
   auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
   {
     Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
     ret.checkerboard=lhs.checkerboard;
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
-    }
-    return ret;
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
+    }
+    return ret;
   };

   ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP
   template<int Index,class vobj>
   void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
   {
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
-    }
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
+    }
   }
   template<int Index,class vobj>
   void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
   {
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
-    }
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
+    }
   }

   //////////////////////////////////////////////////////
|
|||||||
|
|
||||||
assert( l.checkerboard == l._grid->CheckerBoard(site));
|
assert( l.checkerboard == l._grid->CheckerBoard(site));
|
||||||
|
|
||||||
// FIXME
|
|
||||||
// assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
|
||||||
|
|
||||||
int rank,odx,idx;
|
int rank,odx,idx;
|
||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
|
@@ -40,8 +40,7 @@ namespace Grid {

   template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
     Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
       ret._odata[ss] = adj(lhs._odata[ss]);
     }
     return ret;
@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs._grid);
|
Lattice<vobj> ret(lhs._grid);
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
|
||||||
for(int ss=0;ss<lhs._grid->oSites();ss++){
|
ret._odata[ss] = conjugate(lhs._odata[ss]);
|
||||||
ret._odata[ss] = conjugate(lhs._odata[ss]);
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
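
Every hunk above follows the same mechanical rewrite: the two-line PARALLEL_FOR_LOOP / for(...) idiom collapses into a single parallel_for(...) statement. The test files touched later in this commit show the one-line definition being removed (#define parallel_for PARALLEL_FOR_LOOP for), so a minimal sketch of the whole mechanism — assuming PARALLEL_FOR_LOOP is the usual OpenMP guard, which this diff does not itself show — looks like:

    // Sketch only: PARALLEL_FOR_LOOP expanding to an OpenMP pragma is an
    // assumption; the real definition lives in Grid's threading header.
    #ifdef GRID_OMP
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    #else
    #define PARALLEL_FOR_LOOP
    #endif
    // Appending the trailing "for" keyword lets a call site read as one
    // statement:
    #define parallel_for PARALLEL_FOR_LOOP for
    //
    // so that  parallel_for(int ss=0;ss<N;ss++){ ... }
    // expands to
    //   #pragma omp parallel for
    //   for(int ss=0;ss<N;ss++){ ... }
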
@@ -57,8 +57,7 @@ namespace Grid {
 sumarray[i]=zero;
 }
 
-PARALLEL_FOR_LOOP
-for(int thr=0;thr<grid->SumArraySize();thr++){
+parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 int nwork, mywork, myoff;
 GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
 
@@ -68,7 +67,7 @@ PARALLEL_FOR_LOOP
 }
 sumarray[thr]=TensorRemove(vnrm) ;
 }
 
 vector_type vvnrm; vvnrm=zero; // sum across threads
 for(int i=0;i<grid->SumArraySize();i++){
 vvnrm = vvnrm+sumarray[i];
@@ -114,18 +113,17 @@ PARALLEL_FOR_LOOP
 sumarray[i]=zero;
 }
 
-PARALLEL_FOR_LOOP
-for(int thr=0;thr<grid->SumArraySize();thr++){
+parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
 int nwork, mywork, myoff;
 GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
 
 vobj vvsum=zero;
 for(int ss=myoff;ss<mywork+myoff; ss++){
 vvsum = vvsum + arg._odata[ss];
 }
 sumarray[thr]=vvsum;
 }
 
 vobj vsum=zero; // sum across threads
 for(int i=0;i<grid->SumArraySize();i++){
 vsum = vsum+sumarray[i];
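
The two reductions above (the norm and the site sum) share one structure: a parallel pass writes one partial result per thread slot, then a short serial pass combines the slots, which keeps the vectorised accumulation deterministic. A condensed sketch of the site-sum case, using only names visible in the hunks (the contiguous-chunk contract of GridThread::GetWork is an assumption about its interface):

    // Two-stage reduction: parallel partial sums, one slot per thread,
    // then a serial combine over the slots.
    std::vector<vobj> sumarray(grid->SumArraySize());
    for(int i=0;i<grid->SumArraySize();i++) sumarray[i]=zero;

    parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
      int mywork, myoff;
      GridThread::GetWork(grid->oSites(),thr,mywork,myoff); // contiguous chunk per thread
      vobj vvsum=zero;
      for(int ss=myoff;ss<mywork+myoff;ss++) vvsum = vvsum + arg._odata[ss];
      sumarray[thr]=vvsum;  // no races: each thread owns its own slot
    }
    vobj vsum=zero;
    for(int i=0;i<grid->SumArraySize();i++) vsum = vsum+sumarray[i]; // serial combine
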
@@ -302,8 +302,7 @@ namespace Grid {
 int words=sizeof(scalar_object)/sizeof(scalar_type);
 
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<osites;ss++){
+parallel_for(int ss=0;ss<osites;ss++){
 
 std::vector<scalar_object> buf(Nsimd);
 for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
 
@@ -42,8 +42,7 @@ namespace Grid {
 -> Lattice<decltype(trace(lhs._odata[0]))>
 {
 Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = trace(lhs._odata[ss]);
 }
 return ret;
@@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
 inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
 {
 Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
 }
 return ret;
 
@@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
 half.checkerboard = cb;
 int ssh=0;
-//PARALLEL_FOR_LOOP
+//parallel_for
 for(int ss=0;ss<full._grid->oSites();ss++){
 std::vector<int> coor;
 int cbos;
@@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
 int cb = half.checkerboard;
 int ssh=0;
-//PARALLEL_FOR_LOOP
+//parallel_for
 for(int ss=0;ss<full._grid->oSites();ss++){
 std::vector<int> coor;
 int cbos;
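
Note that pickCheckerboard and setCheckerboard keep their site loops serial: only the comment is renamed from //PARALLEL_FOR_LOOP to //parallel_for. Presumably the running ssh index into the half-checkerboard field makes the iterations order-dependent, so the loop cannot safely be parallelised as written.
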
@@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
 assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
 }
 
-PARALLEL_FOR_LOOP
-for(int sf=0;sf<fine->oSites();sf++){
+parallel_for(int sf=0;sf<fine->oSites();sf++){
 
 int sc;
 std::vector<int> coor_c(_ndimension);
@@ -186,8 +185,7 @@ template<class vobj,class CComplex>
 
 fine_inner = localInnerProduct(fineX,fineY);
 blockSum(coarse_inner,fine_inner);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<coarse->oSites();ss++){
+parallel_for(int ss=0;ss<coarse->oSites();ss++){
 CoarseInner._odata[ss] = coarse_inner._odata[ss];
 }
 }
@@ -347,8 +345,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
 assert(ig->lSites() == og->lSites());
 }
 
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<ig->lSites();idx++){
+parallel_for(int idx=0;idx<ig->lSites();idx++){
 sobj s;
 ssobj ss;
 
@@ -386,8 +383,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
 }
 
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -428,8 +424,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 }
 }
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -468,8 +463,7 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice
 }
 
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -504,8 +498,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
 }
 
 // the above should guarantee that the operations are local
-PARALLEL_FOR_LOOP
-for(int idx=0;idx<lg->lSites();idx++){
+parallel_for(int idx=0;idx<lg->lSites();idx++){
 sobj s;
 std::vector<int> lcoor(nl);
 std::vector<int> hcoor(nh);
@@ -574,8 +567,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
 in_grid->iCoorFromIindex(in_icoor[lane], lane);
 }
 
-PARALLEL_FOR_LOOP
-for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
 //Assemble vector of pointers to output elements
 std::vector<sobj*> out_ptrs(in_nsimd);
 
@@ -623,8 +615,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 std::vector<SobjOut> in_slex_conv(in_grid->lSites());
 unvectorizeToLexOrdArray(in_slex_conv, in);
 
-PARALLEL_FOR_LOOP
-for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
+parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
 std::vector<int> out_ocoor(ndim);
 out_grid->oCoorFromOindex(out_ocoor, out_oidx);
 
@@ -642,10 +633,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
 merge(out._odata[out_oidx], ptrs, 0);
 }
 }
-
-
-
-
 }
 #endif
@@ -40,27 +40,24 @@ namespace Grid {
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj>
 inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
 Lattice<vobj> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = transpose(lhs._odata[ss]);
 }
 return ret;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Index level dependent transpose
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<int Index,class vobj>
 inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
 {
 Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<lhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
 ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
 }
 return ret;
 };
 
 }
 #endif
@@ -37,8 +37,7 @@ namespace Grid {
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=pow(rhs._odata[ss],y);
 }
 return ret;
@@ -47,8 +46,7 @@ PARALLEL_FOR_LOOP
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=mod(rhs._odata[ss],y);
 }
 return ret;
@@ -58,8 +56,7 @@ PARALLEL_FOR_LOOP
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=div(rhs._odata[ss],y);
 }
 return ret;
@@ -69,8 +66,7 @@ PARALLEL_FOR_LOOP
 Lattice<obj> ret(rhs._grid);
 ret.checkerboard = rhs.checkerboard;
 conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<rhs._grid->oSites();ss++){
+parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
 ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
 }
 return ret;
@@ -56,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
 std::vector<scalar_object> truevals (Nsimd);
 std::vector<scalar_object> falsevals(Nsimd);
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
 
 extract(iftrue._odata[ss] ,truevals);
 extract(iffalse._odata[ss] ,falsevals);
@@ -54,8 +54,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 // Flops = 6.0*(Nc*Ns) *Ls*vol
 M5Dcalls++;
 M5Dtime-=usecond();
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 for(int s=0;s<Ls;s++){
 auto tmp = psi._odata[0];
 if ( s==0 ) {
@@ -98,8 +98,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 // Flops = 6.0*(Nc*Ns) *Ls*vol
 M5Dcalls++;
 M5Dtime-=usecond();
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 auto tmp = psi._odata[0];
 for(int s=0;s<Ls;s++){
 if ( s==0 ) {
@@ -137,8 +137,7 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
 MooeeInvCalls++;
 MooeeInvTime-=usecond();
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 auto tmp = psi._odata[0];
 
 // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
@@ -184,8 +183,7 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
 MooeeInvCalls++;
 MooeeInvTime-=usecond();
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 
 auto tmp = psi._odata[0];
 
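
All four five-dimensional kernels above parallelise over four-dimensional sites in strides of Ls and walk the fifth dimension serially inside each stride. A stripped-down sketch of that indexing convention — the s-fastest layout, i.e. _odata[ss+s], is read off the loops above; the trivial body is a placeholder for the real s-coupling:

    parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // one block of Ls 5d sites
      for(int s=0;s<Ls;s++){                         // fifth dimension, serial
        // placeholder: the real kernels couple slice s to s-1 and s+1 here
        chi._odata[ss+s] = psi._odata[ss+s];
      }
    }
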
@@ -91,8 +91,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 
 assert(Nc==3);
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 #if 0
 alignas(64) SiteHalfSpinor hp;
 alignas(64) SiteHalfSpinor hm;
@@ -232,8 +231,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 
 M5Dcalls++;
 M5Dtime-=usecond();
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
+parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 #if 0
 alignas(64) SiteHalfSpinor hp;
 alignas(64) SiteHalfSpinor hm;
@@ -792,13 +790,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 MooeeInvTime-=usecond();
 
 if ( switcheroo<Coeff_t>::iscomplex() ) {
-PARALLEL_FOR_LOOP
-for(auto site=0;site<vol;site++){
+parallel_for(auto site=0;site<vol;site++){
 MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
 }
 } else {
-PARALLEL_FOR_LOOP
-for(auto site=0;site<vol;site++){
+parallel_for(auto site=0;site<vol;site++){
 MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
 }
 }
 
@@ -194,8 +194,7 @@ namespace QCD {
 GaugeLinkField tmp(mat._grid);
 tmp = zero;
 
-PARALLEL_FOR_LOOP
-for(int sss=0;sss<tmp._grid->oSites();sss++){
+parallel_for(int sss=0;sss<tmp._grid->oSites();sss++){
 int sU=sss;
 for(int s=0;s<Ls;s++){
 int sF = s+Ls*sU;
@@ -445,8 +444,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 Uconj = where(coor==neglink,-Uconj,Uconj);
 }
 
-PARALLEL_FOR_LOOP
-for(auto ss=U.begin();ss<U.end();ss++){
+parallel_for(auto ss=U.begin();ss<U.end();ss++){
 Uds[ss](0)(mu) = U[ss]();
 Uds[ss](1)(mu) = Uconj[ss]();
 }
@@ -459,8 +457,7 @@ PARALLEL_FOR_LOOP
 Utmp = where(coor==0,Uconj,Utmp);
 }
 
-PARALLEL_FOR_LOOP
-for(auto ss=U.begin();ss<U.end();ss++){
+parallel_for(auto ss=U.begin();ss<U.end();ss++){
 Uds[ss](0)(mu+4) = Utmp[ss]();
 }
 
@@ -469,8 +466,7 @@ PARALLEL_FOR_LOOP
 Utmp = where(coor==0,U,Utmp);
 }
 
-PARALLEL_FOR_LOOP
-for(auto ss=U.begin();ss<U.end();ss++){
+parallel_for(auto ss=U.begin();ss<U.end();ss++){
 Uds[ss](1)(mu+4) = Utmp[ss]();
 }
 
@@ -484,8 +480,7 @@ PARALLEL_FOR_LOOP
 GaugeLinkField link(mat._grid);
 // use lorentz for flavour as hack.
 auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
-PARALLEL_FOR_LOOP
-for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
+parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
 link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
 }
 PokeIndex<LorentzIndex>(mat, link, mu);
@@ -498,8 +493,7 @@ PARALLEL_FOR_LOOP
 
 GaugeLinkField tmp(mat._grid);
 tmp = zero;
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
+parallel_for(int ss = 0; ss < tmp._grid->oSites(); ss++) {
 for (int s = 0; s < Ls; s++) {
 int sF = s + Ls * ss;
 auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF]));
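
The G-parity hunks above also use an iterator-style spelling, parallel_for(auto ss=U.begin(); ss<U.end(); ss++). Because parallel_for is plain textual substitution ending in the for keyword (see the define removed from the test programs below), any loop header is accepted. The caveat — an assumption about the build, not shown in this diff — is that when PARALLEL_FOR_LOOP expands to an OpenMP pragma, the loop variable must still satisfy OpenMP's canonical loop form, which these begin()/end() site ranges evidently do.
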
@@ -222,8 +222,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
 ////////////////////////
 // Call the single hop
 ////////////////////////
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < B._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
 Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
 gamma);
 }
@@ -333,8 +332,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
 
 Stencil.HaloExchange(in, compressor);
 
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < in._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
 Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
 }
 };
@@ -350,13 +348,11 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
 st.HaloExchange(in, compressor);
 
 if (dag == DaggerYes) {
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < in._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
 Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
 }
 } else {
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < in._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
 Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
 }
 }
@@ -275,8 +275,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
 assert(dirdisp<=7);
 assert(dirdisp>=0);
 
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<Umu._grid->oSites();ss++){
+parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
 for(int s=0;s<Ls;s++){
 int sU=ss;
 int sF = s+Ls*sU;
@@ -323,8 +322,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 ////////////////////////
 
 DerivDhopComputeTime -= usecond();
-PARALLEL_FOR_LOOP
-for (int sss = 0; sss < U._grid->oSites(); sss++) {
+parallel_for (int sss = 0; sss < U._grid->oSites(); sss++) {
 for (int s = 0; s < Ls; s++) {
 int sU = sss;
 int sF = s + Ls * sU;
@@ -493,73 +491,18 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
 // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 
 if (dag == DaggerYes) {
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
 int sU = ss;
 int sF = LLs * sU;
 Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
 }
 } else {
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
 int sU = ss;
 int sF = LLs * sU;
 Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
 }
 }
-/*
-
-if (dag == DaggerYes) {
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
-int sU = ss;
-int sF = LLs * sU;
-Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-}
-#ifdef AVX512_SWITCHOFF
-} else if (stat.is_init() ) {
-
-int nthreads;
-stat.start();
-#pragma omp parallel
-{
-#pragma omp master
-nthreads = omp_get_num_threads();
-int mythread = omp_get_thread_num();
-stat.enter(mythread);
-#pragma omp for nowait
-for(int ss=0;ss<U._grid->oSites();ss++) {
-int sU=ss;
-int sF=LLs*sU;
-Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-}
-stat.exit(mythread);
-}
-stat.accum(nthreads);
-#endif
-} else {
-#if 1
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < U._grid->oSites(); ss++) {
-int sU = ss;
-int sF = LLs * sU;
-Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
-}
-#else
-#ifdef GRID_OMP
-#pragma omp parallel
-#endif
-{
-int len = U._grid->oSites();
-int me, myoff,mywork;
-GridThread::GetWorkBarrier(len,me, mywork,myoff);
-int sF = LLs * myoff;
-Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out);
-}
-#endif
-}
-*/
-
 DhopComputeTime+=usecond();
 }
 
@@ -66,8 +66,7 @@ public:
 // Move this elsewhere? FIXME
 static inline void AddGaugeLink(GaugeField &U, GaugeLinkField &W,
 int mu) { // U[mu] += W
-PARALLEL_FOR_LOOP
-for (auto ss = 0; ss < U._grid->oSites(); ss++) {
+parallel_for (auto ss = 0; ss < U._grid->oSites(); ss++) {
 U._odata[ss]._internal[mu] =
 U._odata[ss]._internal[mu] + W._odata[ss]._internal;
 }
@@ -48,8 +48,7 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
 GridBase *grid=x._grid;
 
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss++){
+parallel_for(int ss=0;ss<grid->oSites();ss++){
 vobj tmp;
 tmp = a*x._odata[ss];
 tmp = tmp + G5*(b*timesI(x._odata[ss]));
@@ -65,8 +64,7 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
 conformable(x,z);
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp = a*x._odata[ss+s]+b*y._odata[ss+sp];
 vstream(z._odata[ss+s],tmp);
 }
@@ -81,8 +79,7 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 tmp = G5*x._odata[ss+s]*a;
 tmp = tmp + b*y._odata[ss+sp];
@@ -99,8 +96,7 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 tmp = G5*y._odata[ss+sp]*b;
 tmp = tmp + a*x._odata[ss+s];
@@ -117,8 +113,7 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp1;
 vobj tmp2;
 tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
@@ -135,8 +130,7 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
 conformable(x,z);
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 spProj5m(tmp,y._odata[ss+sp]);
 tmp = a*x._odata[ss+s]+b*tmp;
@@ -152,8 +146,7 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
 conformable(x,z);
 GridBase *grid=x._grid;
 int Ls = grid->_rdimensions[0];
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 spProj5p(tmp,y._odata[ss+sp]);
 tmp = a*x._odata[ss+s]+b*tmp;
@@ -169,8 +162,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
 conformable(x,z);
 int Ls = grid->_rdimensions[0];
 Gamma G5(Gamma::Gamma5);
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
+parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 vobj tmp;
 for(int s=0;s<Ls;s++){
 int sp = Ls-1-s;
@@ -221,8 +221,7 @@ class SU {
 int i0, i1;
 su2SubGroupIndex(i0, i1, su2_index);
 
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
 subgroup._odata[ss]()()(0, 0) = source._odata[ss]()()(i0, i0);
 subgroup._odata[ss]()()(0, 1) = source._odata[ss]()()(i0, i1);
 subgroup._odata[ss]()()(1, 0) = source._odata[ss]()()(i1, i0);
@@ -252,8 +251,7 @@ class SU {
 su2SubGroupIndex(i0, i1, su2_index);
 
 dest = 1.0; // start out with identity
-PARALLEL_FOR_LOOP
-for (int ss = 0; ss < grid->oSites(); ss++) {
+parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
 dest._odata[ss]()()(i0, i0) = subgroup._odata[ss]()()(0, 0);
 dest._odata[ss]()()(i0, i1) = subgroup._odata[ss]()()(0, 1);
 dest._odata[ss]()()(i1, i0) = subgroup._odata[ss]()()(1, 0);
@@ -31,8 +31,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
 Grid_init(&argc,&argv);
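
The same two-line deletion repeats across the remaining benchmarks: each test program had carried its own local copy of

    #define parallel_for PARALLEL_FOR_LOOP for

and with this commit the macro is evidently provided once by the library headers instead, so the per-file copies go.
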
@@ -31,8 +31,6 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
 Grid_init(&argc,&argv);
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {
@@ -31,7 +31,7 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-#define parallel_for PARALLEL_FOR_LOOP for
-
 int main (int argc, char ** argv)
 {