diff --git a/lib/algorithms/CoarsenedMatrix.h b/lib/algorithms/CoarsenedMatrix.h index 08df1978..9adc2e68 100644 --- a/lib/algorithms/CoarsenedMatrix.h +++ b/lib/algorithms/CoarsenedMatrix.h @@ -128,9 +128,9 @@ public: for(int i=0;ioSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ eProj[ss](i)=CComplex(1.0); - } + }); eProj=eProj - iProj; std::cout< compressor; Stencil.HaloExchange(in,compressor); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ siteVector res = Zero(); siteVector nbr; int ptype; @@ -291,7 +291,7 @@ public: res = res + A[point][ss]*nbr; } vstream(out[ss],res); - } + }); return norm2(out); }; @@ -384,14 +384,14 @@ public: Subspace.ProjectToSubspace(oProj,oblock); // blockProject(iProj,iblock,Subspace.subspace); // blockProject(oProj,oblock,Subspace.subspace); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for(int j=0;j &basis,Eigen::MatrixXd& Qt,int j0, int j1, i typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); - parallel_region + thread_region { std::vector < vobj > B(Nm); // Thread private - parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ + thread_loop_in_region( (int ss=0;ss < grid->oSites();ss++),{ for(int j=j0; j &basis,Eigen::MatrixXd& Qt,int j0, int j1, i for(int j=j0; j &basis,Eigen::MatrixXd& Qt,in GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); - parallel_for(int ss=0;ss < grid->oSites();ss++){ + thread_loop( (int ss=0;ss < grid->oSites();ss++),{ vobj B = Zero(); for(int k=k0; k diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 778855ef..efec7c67 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -48,7 +48,7 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int stride=rhs.Grid()->_slice_stride[dimension]; if ( cbmask == 0x3 ) { - thread_loop_collapse( (int n=0;n &rhs,std::vector_slice_stride[dimension]; if ( cbmask ==0x3){ - thread_loop_collapse( (int n=0;n &rhs,std::vector void Scatter_plane_simple (Lattice &rhs,commVector_slice_stride[dimension]; if ( cbmask ==0x3 ) { - thread_loop_collapse( (int n=0;n_slice_stride[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension]; @@ -184,7 +184,7 @@ template void Scatter_plane_merge(Lattice &rhs,std::vector_slice_block[dimension]; if(cbmask ==0x3 ) { - thread_loop_collapse( (int n=0;n_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; @@ -228,7 +228,7 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int e2=rhs.Grid()->_slice_block[dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; if(cbmask == 0x3 ){ - thread_loop_collapse( (int n=0;n void Copy_plane(Lattice& lhs,const Lattice &rhs } }); } else { - thread_loop_collapse( (int n=0;nCheckerBoardFromOindex(o); @@ -266,7 +266,7 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_block [dimension]; int stride = rhs.Grid()->_slice_stride[dimension]; - thread_loop_collapse( (int n=0;n &left,const Lattice &righ std::vector > sumarray(grid->SumArraySize()); - parallel_for(int thr=0;thrSumArraySize();thr++){ + thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff); @@ -57,7 +57,7 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ vnrm = vnrm + innerProductD(left[ss],right[ss]); } sumarray[thr]=TensorRemove(vnrm) ; - } + }); vector_type vvnrm; vvnrm=Zero(); // sum across threads for(int 
i=0;iSumArraySize();i++){ @@ -104,7 +104,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) sumarray[i]=Zero(); } - parallel_for(int thr=0;thrSumArraySize();thr++){ + thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; GridThread::GetWork(grid->oSites(),thr,mywork,myoff); @@ -113,7 +113,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) vvsum = vvsum + arg[ss]; } sumarray[thr]=vvsum; - } + }); vobj vsum=Zero(); // sum across threads for(int i=0;iSumArraySize();i++){ @@ -172,8 +172,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir - // Parallel over orthog direction - parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -183,7 +182,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< lvSum[r]=lvSum[r]+Data[ss]; } } - } + }); // Sum across simd lanes in the plane, breaking out orthog dir. std::vector icoor(Nd); @@ -252,7 +251,7 @@ static void sliceInnerProductVector( std::vector & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -263,7 +262,7 @@ static void sliceInnerProductVector( std::vector & result, const Latti lvSum[r]=lvSum[r]+vv; } } - } + }); // Sum across simd lanes in the plane, breaking out orthog dir. std::vector icoor(Nd); @@ -359,12 +358,12 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice tensor_reduced at; at=av; - parallel_for_nest2(int n=0;n::M5D(const FermionField &psi, M5Dcalls++; M5Dtime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls for(int s=0;s::M5D(const FermionField &psi, chi[ss+s]=chi[ss+s]+lower[s]*tmp; } } - } + }); M5Dtime+=usecond(); } @@ -97,7 +97,7 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, M5Dcalls++; M5Dtime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls auto tmp = psi[0]; for(int s=0;s::M5Ddag(const FermionField &psi, chi[ss+s]=chi[ss+s]+lower[s]*tmp; } } - } + }); M5Dtime+=usecond(); } @@ -135,7 +135,7 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & MooeeInvCalls++; MooeeInvTime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls auto tmp = psi[0]; // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops @@ -163,7 +163,7 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & spProj5m(tmp,chi[ss+s+1]); chi[ss+s] = chi[ss+s] - uee[s]*tmp; } - } + }); MooeeInvTime+=usecond(); @@ -193,7 +193,7 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & MooeeInvCalls++; MooeeInvTime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls auto tmp = psi[0]; @@ -221,7 +221,7 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & spProj5p(tmp,chi[ss+s+1]); chi[ss+s] = chi[ss+s] - leec[s]*tmp; } - } + }); MooeeInvTime+=usecond(); diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index eaed9b9c..04732567 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -93,7 +93,7 @@ void 
CayleyFermion5D::M5D(const FermionField &psi, assert(Nc==3); - parallel_for(int ss=0;ssoSites();ss+=LLs){ // adds LLs + thread_loop( (int ss=0;ssoSites();ss+=LLs),{ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; @@ -190,7 +190,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, } #endif - } + }); M5Dtime+=usecond(); } @@ -233,7 +233,7 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, M5Dcalls++; M5Dtime-=usecond(); - parallel_for(int ss=0;ssoSites();ss+=LLs){ // adds LLs + thread_loop( (int ss=0;ssoSites();ss+=LLs),{ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; @@ -327,7 +327,7 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, vstream(chi[ss+v]()(3)(2),p_32); } #endif - } + }); M5Dtime+=usecond(); } @@ -792,13 +792,13 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField MooeeInvTime-=usecond(); if ( switcheroo::iscomplex() ) { - parallel_for(auto site=0;site::M5D(const FermionField& psi, const FermionFiel this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ // adds Ls for(int s=0; s::M5D(const FermionField& psi, const FermionFiel chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -90,7 +90,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionF this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls auto tmp = psi[0]; for(int s=0; s::M5Ddag(const FermionField& psi, const FermionF chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -126,7 +126,7 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi, FermionField this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -158,7 +158,7 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi, FermionField spProj5m(tmp1, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1; } - } + }); this->MooeeInvTime += usecond(); } @@ -190,7 +190,7 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi, FermionFi this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ // adds Ls + thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -221,7 +221,7 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi, FermionFi spProj5p(tmp1, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - leec[s]*tmp1; } - } + }); this->MooeeInvTime += usecond(); } diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc index 3952ce9d..3d24999c 100644 --- a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc @@ -89,7 +89,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionFiel assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -191,7 +191,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, const FermionFiel } #endif - } + }); this->M5Dtime += usecond(); } @@ -232,7 +232,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionF this->M5Dcalls++; 
this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop((int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -330,7 +330,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, const FermionF } #endif - } + }); this->M5Dtime += usecond(); } @@ -566,13 +566,13 @@ void DomainWallEOFAFermion::MooeeInternal(const FermionField& psi, Fermion this->MooeeInvTime -= usecond(); if(switcheroo::iscomplex()){ - parallel_for(auto site=0; siteMooeeInvTime += usecond(); diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 99a66969..f4748eee 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -268,13 +268,13 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - parallel_for(int sss=0;sssoSites();sss++){ + thread_loop( (int sss=0;sssoSites();sss++),{ int sU=sss; for(int s=0;s(outerProduct(Btilde[sF],Atilde[sF])); // ordering here } - } + }); PokeIndex(mat,tmp,mu); } @@ -591,10 +591,10 @@ public: Uconj = where(coor==neglink,-Uconj,Uconj); } - parallel_for(auto ss=U.begin();ss(outerProduct(Btilde, A)); - parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) { + thread_loop((auto ss = tmp.begin(); ss < tmp.end(); ss++), { link[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1)); - } + }); PokeIndex(mat, link, mu); return; } @@ -640,13 +640,13 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - parallel_for(int ss = 0; ss < tmp.Grid()->oSites(); ss++) { + thread_loop((int ss = 0; ss < tmp.Grid()->oSites(); ss++) ,{ for (int s = 0; s < Ls; s++) { int sF = s + Ls * ss; auto ttmp = traceIndex(outerProduct(Btilde[sF], Atilde[sF])); tmp[ss]() = tmp[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); } - } + }); PokeIndex(mat, tmp, mu); return; } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 9dbb0246..a4b59783 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -172,13 +172,13 @@ void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionFi Compressor compressor; Stencil.HaloExchange(in,compressor); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for(int s=0;s @@ -240,15 +240,15 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion if (dag == DaggerYes) { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), { int sU=ss; Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); - } + }); } else { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{ int sU=ss; Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); - } + }); } DhopComputeTime += usecond(); DhopTotalTime += usecond(); diff --git a/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc b/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc index bdbc6fe0..c730e91b 100644 --- a/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc +++ b/lib/qcd/action/fermion/MobiusEOFAFermioncache.cc @@ -50,7 +50,7 @@ void MobiusEOFAFermion::M5D(const FermionField &psi, const FermionField &p this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ for(int 
s=0; s::M5D(const FermionField &psi, const FermionField &p chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -91,7 +91,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi, const FermionFi this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ for(int s=0; s::M5D_shift(const FermionField &psi, const FermionFi else{ spProj5m(tmp, psi[ss+shift_s]); } chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp; } - } + }); this->M5Dtime += usecond(); } @@ -133,7 +133,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi, const FermionField this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp = psi[0]; for(int s=0; s::M5Ddag(const FermionField &psi, const FermionField chi[ss+s] = chi[ss+s] + lower[s]*tmp; } } - } + }); this->M5Dtime += usecond(); } @@ -174,7 +174,7 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi, const Fermio this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ chi[ss+Ls-1] = Zero(); auto tmp = psi[0]; for(int s=0; s::M5Ddag_shift(const FermionField &psi, const Fermio else{ spProj5m(tmp, psi[ss+s]); } chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp; } - } + }); this->M5Dtime += usecond(); } @@ -216,7 +216,7 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi, FermionField &ch this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp = psi[0]; @@ -245,7 +245,7 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi, FermionField &ch spProj5m(tmp, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - this->uee[s]*tmp; } - } + }); this->MooeeInvTime += usecond(); } @@ -261,7 +261,7 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi, FermionFie this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -300,7 +300,7 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi, FermionFie spProj5m(tmp1, chi[ss+s]); chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj; } - } + }); this->MooeeInvTime += usecond(); } @@ -318,7 +318,7 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi, FermionField this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp = psi[0]; @@ -347,7 +347,7 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi, FermionField spProj5p(tmp, chi[ss+s+1]); chi[ss+s] = chi[ss+s] - this->lee[s]*tmp; } - } + }); this->MooeeInvTime += usecond(); } @@ -363,7 +363,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi, Fermion this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=Ls){ + thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ auto tmp1 = psi[0]; auto tmp2 = psi[0]; @@ -401,7 +401,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi, Fermion spProj5p(tmp1, chi[ss+s]); chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj; } - } + }); this->MooeeInvTime += usecond(); } diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc index 
f944b3be..330dab36 100644 --- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc @@ -100,7 +100,7 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& p assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -202,7 +202,7 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, const FermionField& p } #endif - } + }); this->M5Dtime += usecond(); } @@ -263,7 +263,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionFi assert(Nc == 3); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs int vs = (this->pm == 1) ? LLs-1 : 0; Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0); @@ -381,7 +381,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField& psi, const FermionFi vstream(chi[ss+v]()(3)(1), p_31); vstream(chi[ss+v]()(3)(2), p_32); } - } + }); this->M5Dtime += usecond(); @@ -424,7 +424,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs #if 0 @@ -525,7 +525,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, const FermionField #endif - } + }); this->M5Dtime += usecond(); } @@ -584,7 +584,7 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const Fermio this->M5Dcalls++; this->M5Dtime -= usecond(); - parallel_for(int ss=0; ssoSites(); ss+=LLs){ // adds LLs + thread_loop( (int ss=0; ssoSites(); ss+=LLs),{ // adds LLs int vs = (this->pm == 1) ? LLs-1 : 0; Simd hs_00 = (this->pm == 1) ? 
psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0); @@ -703,7 +703,7 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField& psi, const Fermio } - } + }); this->M5Dtime += usecond(); @@ -943,13 +943,13 @@ void MobiusEOFAFermion::MooeeInternal(const FermionField& psi, FermionFiel this->MooeeInvTime -= usecond(); if(switcheroo::iscomplex()){ - parallel_for(auto site=0; siteMooeeInvTime += usecond(); diff --git a/lib/qcd/action/fermion/SchurDiagTwoKappa.h b/lib/qcd/action/fermion/SchurDiagTwoKappa.h index a8de7632..c6e5470b 100644 --- a/lib/qcd/action/fermion/SchurDiagTwoKappa.h +++ b/lib/qcd/action/fermion/SchurDiagTwoKappa.h @@ -54,10 +54,10 @@ public: out.Checkerboard() = in.Checkerboard(); assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ vobj tmp = s[ss % Ls]*in[ss]; vstream(out[ss],tmp); - } + }); } RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) { diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index a398a7ff..4d7d340c 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -216,9 +216,9 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, //////////////////////// // Call the single hop //////////////////////// - parallel_for (int sss = 0; sss < B.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++) ,{ Kernels::DhopDirK(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma); - } + }); ////////////////////////////////////////////////// // spin trace outer product @@ -317,9 +317,9 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, Stencil.HaloExchange(in, compressor); - parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{ Kernels::DhopDirK(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); - } + }); }; template @@ -333,13 +333,13 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, st.HaloExchange(in, compressor); if (dag == DaggerYes) { - parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{ Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); - } + }); } else { - parallel_for (int sss = 0; sss < in.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{ Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); - } + }); } }; @@ -366,8 +366,7 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, // Inefficient comms method but not performance critical. tmp1 = Cshift(q_in_1, mu, 1); tmp2 = Cshift(q_in_2, mu, 1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), { Kernels::ContractConservedCurrentSiteFwd(tmp1[sU], q_in_2[sU], q_out[sU], @@ -376,7 +375,7 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, tmp2[sU], q_out[sU], Umu, sU, mu); - } + }); } template @@ -415,38 +414,36 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, tmp = ph*q_in; tmpBwd = Cshift(tmp, mu, -1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. 
- vInteger t_mask = ((coords[sU] >= tmin) && - (coords[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask); + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), { - if (timeSlices > 0) - { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sU], - q_out[sU], - Umu, sU, mu, t_mask); - } + // Compute the sequential conserved current insertion only if our simd + // object contains a timeslice we need. + vInteger t_mask = ((coords[sU] >= tmin) && + (coords[sU] <= tmax)); + Integer timeSlices = Reduce(t_mask); - // Repeat for backward direction. - t_mask = ((coords[sU] >= (tmin + tshift)) && - (coords[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); - - timeSlices = Reduce(t_mask); - - if (timeSlices > 0) - { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sU], - q_out[sU], - Umu, sU, mu, t_mask); - } + if (timeSlices > 0) { + Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sU], + q_out[sU], + Umu, sU, mu, t_mask); } + + // Repeat for backward direction. + t_mask = ((coords[sU] >= (tmin + tshift)) && + (coords[sU] <= (tmax + tshift))); + + //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) + unsigned int t0 = 0; + if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); + + timeSlices = Reduce(t_mask); + + if (timeSlices > 0) { + Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sU], + q_out[sU], + Umu, sU, mu, t_mask); + } + }); } FermOpTemplateInstantiate(WilsonFermion); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index da62a633..f32df1fd 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -244,13 +244,13 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in assert(dirdisp<=7); assert(dirdisp>=0); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for(int s=0;s @@ -293,7 +293,7 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, //////////////////////// DerivDhopComputeTime -= usecond(); - parallel_for (int sss = 0; sss < U.Grid()->oSites(); sss++) { + thread_loop( (int sss = 0; sss < U.Grid()->oSites(); sss++) ,{ for (int s = 0; s < Ls; s++) { int sU = sss; int sF = s + Ls * sU; @@ -307,7 +307,7 @@ void WilsonFermion5D::DerivInternal(StencilImpl & st, // spin trace outer product //////////////////////////// } - } + }); //////////////////////////// // spin trace outer product //////////////////////////// @@ -464,18 +464,18 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopComputeTime2-=usecond(); if (dag == DaggerYes) { int sz=st.surface_list.size(); - parallel_for (int ss = 0; ss < sz; ss++) { + thread_loop( (int ss = 0; ss < sz; ss++) ,{ int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } + }); } else { int sz=st.surface_list.size(); - parallel_for (int ss = 0; ss < sz; ss++) { + thread_loop( (int ss = 0; ss < sz; ss++) ,{ int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } + }); } DhopComputeTime2+=usecond(); #else @@ -502,17 +502,17 @@ void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOr // Dhop takes the 4d grid from U, and makes a 5d index for fermion if (dag == DaggerYes) { - parallel_for (int ss = 0; ss < 
U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{ int sU = ss; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); - } + }); } else { - parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) { + thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{ int sU = ss; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); - } + }); } DhopComputeTime+=usecond(); } @@ -739,41 +739,37 @@ void WilsonFermion5D::ContractConservedCurrent(PropagatorField &q_in_1, // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one. tmp1 = Cshift(q_in_1, mu + 1, 1); tmp2 = Cshift(q_in_2, mu + 1, 1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { - unsigned int sF1 = sU * LLs; - unsigned int sF2 = (sU + 1) * LLs - 1; + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), { + unsigned int sF1 = sU * LLs; + unsigned int sF2 = (sU + 1) * LLs - 1; + + for (unsigned int s = 0; s < LLs; ++s) { - for (unsigned int s = 0; s < LLs; ++s) - { - bool axial_sign = ((curr_type == Current::Axial) && \ - (s < (LLs / 2))); - SitePropagator qSite2, qmuSite2; - - // If vectorised in 5th dimension, reverse q2 vector to match up - // sites correctly. - if (Impl::LsVectorised) - { - REVERSE_LS(q_in_2[sF2], qSite2, Ls / LLs); - REVERSE_LS(tmp2[sF2], qmuSite2, Ls / LLs); - } - else - { - qSite2 = q_in_2[sF2]; - qmuSite2 = tmp2[sF2]; - } - Kernels::ContractConservedCurrentSiteFwd(tmp1[sF1], - qSite2, - q_out[sU], - Umu, sU, mu, axial_sign); - Kernels::ContractConservedCurrentSiteBwd(q_in_1[sF1], - qmuSite2, - q_out[sU], - Umu, sU, mu, axial_sign); - sF1++; - sF2--; - } + bool axial_sign = ((curr_type == Current::Axial) && \ + (s < (LLs / 2))); + SitePropagator qSite2, qmuSite2; + + // If vectorised in 5th dimension, reverse q2 vector to match up + // sites correctly. + if (Impl::LsVectorised) { + REVERSE_LS(q_in_2[sF2], qSite2, Ls / LLs); + REVERSE_LS(tmp2[sF2], qmuSite2, Ls / LLs); + } else { + qSite2 = q_in_2[sF2]; + qmuSite2 = tmp2[sF2]; + } + Kernels::ContractConservedCurrentSiteFwd(tmp1[sF1], + qSite2, + q_out[sU], + Umu, sU, mu, axial_sign); + Kernels::ContractConservedCurrentSiteBwd(q_in_1[sF1], + qmuSite2, + q_out[sU], + Umu, sU, mu, axial_sign); + sF1++; + sF2--; } + }); } @@ -817,50 +813,46 @@ void WilsonFermion5D::SeqConservedCurrent(PropagatorField &q_in, tmp = ph*q_in; tmpBwd = Cshift(tmp, mu + 1, -1); - parallel_for (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) - { - // Compute the sequential conserved current insertion only if our simd - // object contains a timeslice we need. - vInteger t_mask = ((coords[sU] >= tmin) && - (coords[sU] <= tmax)); - Integer timeSlices = Reduce(t_mask); + thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU) ,{ + // Compute the sequential conserved current insertion only if our simd + // object contains a timeslice we need. + vInteger t_mask = ((coords[sU] >= tmin) && + (coords[sU] <= tmax)); + Integer timeSlices = Reduce(t_mask); + + if (timeSlices > 0) { - if (timeSlices > 0) - { - unsigned int sF = sU * LLs; - for (unsigned int s = 0; s < LLs; ++s) - { - bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); - Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sF], - q_out[sF], Umu, sU, - mu, t_mask, axial_sign); - ++sF; - } - } - - // Repeat for backward direction. 
- t_mask = ((coords[sU] >= (tmin + tshift)) && - (coords[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) - unsigned int t0 = 0; - if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); - - timeSlices = Reduce(t_mask); - - if (timeSlices > 0) - { - unsigned int sF = sU * LLs; - for (unsigned int s = 0; s < LLs; ++s) - { - bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); - Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sF], - q_out[sF], Umu, sU, - mu, t_mask, axial_sign); - ++sF; - } - } + unsigned int sF = sU * LLs; + for (unsigned int s = 0; s < LLs; ++s) { + bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); + Kernels::SeqConservedCurrentSiteFwd(tmpFwd[sF], + q_out[sF], Umu, sU, + mu, t_mask, axial_sign); + ++sF; + } } + + // Repeat for backward direction. + t_mask = ((coords[sU] >= (tmin + tshift)) && + (coords[sU] <= (tmax + tshift))); + + //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) + unsigned int t0 = 0; + if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords[sU] == t0 )); + + timeSlices = Reduce(t_mask); + + if (timeSlices > 0) { + unsigned int sF = sU * LLs; + for (unsigned int s = 0; s < LLs; ++s) { + bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2))); + Kernels::SeqConservedCurrentSiteBwd(tmpBwd[sF], + q_out[sF], Umu, sU, + mu, t_mask, axial_sign); + ++sF; + } + } + }); } FermOpTemplateInstantiate(WilsonFermion5D); diff --git a/lib/qcd/action/gauge/GaugeImplTypes.h b/lib/qcd/action/gauge/GaugeImplTypes.h index 8a8ecf62..fbe60039 100644 --- a/lib/qcd/action/gauge/GaugeImplTypes.h +++ b/lib/qcd/action/gauge/GaugeImplTypes.h @@ -101,10 +101,10 @@ public: //static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ for (int mu = 0; mu < Nd; mu++) U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]); - } + }); //auto end = std::chrono::high_resolution_clock::now(); // diff += end - start; diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h index d3ef28b3..244bee2e 100644 --- a/lib/qcd/action/scalar/ScalarInteractionAction.h +++ b/lib/qcd/action/scalar/ScalarInteractionAction.h @@ -82,7 +82,7 @@ public: action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared; for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils - parallel_for (int i = 0; i < p.Grid()->oSites(); i++) { + thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{ int permute_type; StencilEntry *SE; vobj temp2; @@ -101,7 +101,7 @@ public: } else { action[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset]; } - } + }); // action -= pshift*p + p*pshift; } // NB the trace in the algebra is normalised to 1/2 @@ -118,7 +118,7 @@ public: //for (int mu = 0; mu < Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1); for (int point = 0; point < npoint; point++) { - parallel_for (int i = 0; i < p.Grid()->oSites(); i++) { + thread_loop( (int i = 0; i < p.Grid()->oSites(); i++) ,{ const vobj *temp; vobj temp2; int permute_type; @@ -136,7 +136,7 @@ public: } else { force[i] -= phiStencil.CommBuf()[SE->_offset]; } - } + }); } } }; diff --git a/lib/qcd/utils/LinalgUtils.h b/lib/qcd/utils/LinalgUtils.h index 
a1a40619..cc023faf 100644 --- a/lib/qcd/utils/LinalgUtils.h +++ b/lib/qcd/utils/LinalgUtils.h @@ -48,12 +48,12 @@ void axpibg5x(Lattice &z,const Lattice &x,Coeff a,Coeff b) GridBase *grid=x.Grid(); Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss++){ + thread_loop( (int ss=0;ssoSites();ss++),{ vobj tmp; tmp = a*x[ss]; tmp = tmp + G5*(b*timesI(x[ss])); vstream(z[ss],tmp); - } + }); } template @@ -64,10 +64,10 @@ void axpby_ssp(Lattice &z, Coeff a,const Lattice &x,Coeff b,const La conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp = a*x[ss+s]+b*y[ss+sp]; vstream(z[ss+s],tmp); - } + }); } template @@ -80,12 +80,12 @@ void ag5xpby_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; tmp = G5*x[ss+s]*a; tmp = tmp + b*y[ss+sp]; vstream(z[ss+s],tmp); - } + }); } template @@ -97,12 +97,12 @@ void axpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; tmp = G5*y[ss+sp]*b; tmp = tmp + a*x[ss+s]; vstream(z[ss+s],tmp); - } + }); } template @@ -115,13 +115,13 @@ void ag5xpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp1; vobj tmp2; tmp1 = a*x[ss+s]+b*y[ss+sp]; tmp2 = G5*tmp1; vstream(z[ss+s],tmp2); - } + }); } template @@ -132,12 +132,12 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; spProj5m(tmp,y[ss+sp]); tmp = a*x[ss+s]+b*tmp; vstream(z[ss+s],tmp); - } + }); } template @@ -148,12 +148,12 @@ void axpby_ssp_pplus(Lattice &z,Coeff a,const Lattice &x,Coeff b,con conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - parallel_for(int ss=0;ssoSites();ss+=Ls){ // adds Ls + thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls vobj tmp; spProj5p(tmp,y[ss+sp]); tmp = a*x[ss+s]+b*tmp; vstream(z[ss+s],tmp); - } + }); } template @@ -164,14 +164,14 @@ void G5R5(Lattice &z,const Lattice &x) conformable(x,z); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - parallel_for(int ss=0;ssoSites();ss+=Ls) { + thread_loop((int ss=0;ssoSites();ss+=Ls) { vobj tmp; for(int s=0;soSites(); ss++) { + thread_loop( (int ss = 0; ss < grid->oSites(); ss++) ,{ subgroup[ss]()()(0, 0) = source[ss]()()(i0, i0); subgroup[ss]()()(0, 1) = source[ss]()()(i0, i1); subgroup[ss]()()(1, 0) = source[ss]()()(i1, i0); @@ -238,7 +238,7 @@ public: // this should be purely real Determinant[ss] = Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0); - } + }); } ////////////////////////////////////////////////////////////////////////////////////////// @@ -253,12 +253,12 @@ public: su2SubGroupIndex(i0, i1, su2_index); dest = 1.0; // start out with identity - parallel_for (int ss = 0; ss < grid->oSites(); ss++) { + 
thread_loop( (int ss = 0; ss < grid->oSites(); ss++) ,{ dest[ss]()()(i0, i0) = subgroup[ss]()()(0, 0); dest[ss]()()(i0, i1) = subgroup[ss]()()(0, 1); dest[ss]()()(i1, i0) = subgroup[ss]()()(1, 0); dest[ss]()()(i1, i1) = subgroup[ss]()()(1, 1); - } + }); } /////////////////////////////////////////////// diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 87e29ae5..9812ab51 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -63,9 +63,9 @@ template void Gather_plane_simple_table (std::vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); - parallel_for(int i=0;i >& table,const L assert( (table.size()&0x1)==0); int num=table.size()/2; int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - parallel_for(int j=0;j void HaloExchange(const Lattice &source,compressor &compress) @@ -509,20 +501,20 @@ public: for(int i=0;i #ifdef GRID_OMP #define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; }; #define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; }; -#define thread_loop_collapse( range , ... ) _Pragma("omp parallel for collapse(2)") for range { __VA_ARGS__ }; +#define thread_loop_collapse( n, range , ... ) _Pragma("omp parallel for collapse(" #n ")") for range { __VA_ARGS__ }; #define thread_region _Pragma("omp parallel") #define thread_critical _Pragma("omp critical") +#define thread_num(a) omp_get_thread_num() +#define thread_max(a) omp_get_max_threads() #else #define thread_loop( range , ... ) for range { __VA_ARGS__ ; }; #define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; }; -#define thread_loop_collapse( range , ... ) for range { __VA_ARGS__ ; }; +#define thread_loop_collapse( n, range , ... ) for range { __VA_ARGS__ ; }; #define thread_region #define thread_critical +#define thread_num(a) (0) +#define thread_max(a) (1) #endif -////////////////////////////////////////////////////////////////////////////////// -// Deprecated primitives; to eridicate when done delete and fix errors. -////////////////////////////////////////////////////////////////////////////////// - -#ifdef GRID_OMP -#define parallel_for _Pragma("omp parallel for schedule(static)") for -#define parallel_for_internal _Pragma("omp for schedule(static)") for -#define parallel_region thread_region -#define parallel_for_nest2 for -#else -#define parallel_for for -#define parallel_for_internal for -#define parallel_region -#define parallel_for_nest2 for -#endif ////////////////////////////////////////////////////////////////////////////////// // Accelerator primitives; fall back to threading
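
The hunks above replace Grid's older parallel_for / parallel_region macros with the variadic thread_loop family defined at the end of lib/Threads.h. The following standalone sketch is not part of the patch; it assumes only the macro shapes visible in that last hunk (with the standard _OPENMP guard standing in for GRID_OMP, and a plain std::vector standing in for a Lattice) and shows why every converted loop now ends in "});": both the loop header and the loop body are macro arguments, the body travels through __VA_ARGS__ (so stray commas in the body are harmless), and the call is closed like a function invocation. It also shows the thread_region / thread_loop_in_region pairing used in basisRotate(), where a thread-private temporary has to live outside the worksharing loop.

#include <cstdio>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#define thread_loop( range , ... )           _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_loop_in_region( range , ... ) _Pragma("omp for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_region                        _Pragma("omp parallel")
#define thread_critical                      _Pragma("omp critical")
#else
#define thread_loop( range , ... )           for range { __VA_ARGS__ ; };
#define thread_loop_in_region( range , ... ) for range { __VA_ARGS__ ; };
#define thread_region
#define thread_critical
#endif

int main(void)
{
  std::vector<double> in(64, 1.0), out(64, 0.0);

  // Old form:  parallel_for(int ss=0; ss<N; ss++) { out[ss] = 2.0*in[ss]; }
  // New form:  the loop header keeps its own parentheses, the body becomes a
  // macro argument, and the invocation is closed with "});".
  thread_loop( (int ss = 0; ss < (int)out.size(); ss++), {
    out[ss] = 2.0 * in[ss];
  });

  // thread_region opens "omp parallel"; thread_loop_in_region is a plain
  // worksharing "omp for" inside it, so per-thread temporaries can be declared
  // once and reused across iterations (the basisRotate() pattern above).
  double total = 0.0;
  thread_region
  {
    double tmp = 0.0;                       // thread private
    thread_loop_in_region( (int ss = 0; ss < (int)out.size(); ss++), {
      tmp += out[ss];
    });
    thread_critical
    {
      total += tmp;                         // fold the private sums back together
    }
  }

  std::printf("sum(out) = %f\n", total);    // expect 128.0
  return 0;
}

The same Threads.h hunk changes thread_loop_collapse to take the collapse depth as an explicit first argument instead of hard-coding collapse(2), which is why the Cshift_common.h call sites gain a leading "2", and it adds thread_num(a)/thread_max(a) as wrappers around omp_get_thread_num()/omp_get_max_threads() with serial fallbacks of 0 and 1.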
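
The Lattice_reduction.h hunks keep Grid's explicit thread-chunked reductions: the outer thread_loop runs over thread indices, GridThread::GetWork hands each thread a contiguous slab of sites, and the per-thread partial sums in sumarray are combined serially afterwards. Below is a minimal standalone analogue of that pattern, again assuming _OPENMP in place of GRID_OMP; get_work() is a hypothetical stand-in for GridThread::GetWork, and plain doubles stand in for the SIMD vector accumulators.

#include <cstdio>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#define thread_loop( range , ... ) _Pragma("omp parallel for schedule(static)") for range { __VA_ARGS__ ; };
#define thread_max(a) omp_get_max_threads()
#else
#define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
#define thread_max(a) (1)
#endif

// Split [0,nwork) into contiguous chunks, one per thread (stand-in for GridThread::GetWork).
static void get_work(int nwork, int thr, int nthr, int &mywork, int &myoff)
{
  int chunk = nwork / nthr;
  int rem   = nwork % nthr;                 // first 'rem' threads take one extra item
  mywork = chunk + ((thr < rem) ? 1 : 0);
  myoff  = thr * chunk + ((thr < rem) ? thr : rem);
}

int main(void)
{
  std::vector<double> site(1024, 0.25);     // stands in for the per-site inner products
  int nthr = thread_max(0);
  std::vector<double> sumarray(nthr, 0.0);  // one partial sum per thread

  // Outer loop over threads: each thread reduces its own contiguous slab,
  // so no atomics are needed inside the hot loop.
  thread_loop( (int thr = 0; thr < nthr; thr++), {
    int mywork, myoff;
    get_work((int)site.size(), thr, nthr, mywork, myoff);
    double vnrm = 0.0;
    for (int ss = myoff; ss < myoff + mywork; ss++) vnrm += site[ss];
    sumarray[thr] = vnrm;
  });

  double total = 0.0;                       // serial combine across threads, as in the diff
  for (int thr = 0; thr < nthr; thr++) total += sumarray[thr];
  std::printf("total = %f\n", total);       // 1024 * 0.25 = 256
  return 0;
}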