From ac569653060c8249f1d81909660b4ae04cc8e79d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 24 Jan 2018 13:28:30 +0000
Subject: [PATCH] GPU changes and threading macros replaced

---
 lib/algorithms/FFT.h                          | 26 ++++++++-----------
 lib/algorithms/LinearOperator.h               |  2 +-
 lib/algorithms/approx/Chebyshev.h             |  2 +-
 lib/algorithms/approx/Forecast.h              |  8 +++---
 lib/algorithms/approx/Remez.cc                |  3 +--
 .../iterative/BlockConjugateGradient.h        | 13 +++++-----
 lib/algorithms/iterative/ConjugateGradient.h  |  6 ++---
 .../ConjugateGradientReliableUpdate.h         |  8 +++---
 lib/algorithms/iterative/ConjugateResidual.h  |  6 ++---
 .../iterative/ImplicitlyRestartedLanczos.h    |  6 ++---
 10 files changed, 38 insertions(+), 42 deletions(-)
diff --git a/lib/algorithms/FFT.h b/lib/algorithms/FFT.h
index e92540fb..e63ff123 100644
--- a/lib/algorithms/FFT.h
+++ b/lib/algorithms/FFT.h
@@ -232,19 +232,17 @@ public:
     result = source;
     int pc = processor_coor[dim];
     for(int p=0;p<processors[dim];p++) {
-      PARALLEL_REGION
-        {
+      thread_region {
           std::vector<int> cbuf(Nd);
           sobj s;
           
-          PARALLEL_FOR_LOOP_INTERN
-	    for(int idx=0;idx<sgrid->lSites();idx++) {
+          thread_loop_in_region( (int idx=0;idx<sgrid->lSites();idx++), {
 	      sgrid->LocalIndexToLocalCoor(idx,cbuf);
 	      peekLocalSite(s,result,cbuf);
 	      cbuf[dim]+=((pc+p) % processors[dim])*L;
 	      //            cbuf[dim]+=p*L;
 	      pokeLocalSite(s,pgbuf,cbuf);
-	    }
+	    } );
         }
       if (p != processors[dim] - 1)
         {
@@ -256,19 +254,18 @@ public:
     int NN=pencil_g.lSites();
     GridStopWatch timer;
     timer.Start();
-    PARALLEL_REGION
-      {
+    thread_region {
+
         std::vector<int> cbuf(Nd);
         
-        PARALLEL_FOR_LOOP_INTERN
-	  for(int idx=0;idx<NN;idx++) {
+        thread_loop_in_region( (int idx=0;idx<NN;idx++), {
 	    pencil_g.LocalIndexToLocalCoor(idx, cbuf);
 	    if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
 	      FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
 	      FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
 	      FFTW<scalar>::fftw_execute_dft(p,in,out);
 	    }
-	  }
+	  });
       }
     timer.Stop();
       
@@ -280,19 +277,18 @@ public:
     flops+= flops_call*NN;
       
     // writing out result
-    PARALLEL_REGION
-      {
+    thread_region {
+
         std::vector<int> clbuf(Nd), cgbuf(Nd);
         sobj s;
         
-        PARALLEL_FOR_LOOP_INTERN
-	  for(int idx=0;idx<sgrid->lSites();idx++) {
+        thread_loop_in_region( (int idx=0;idx<sgrid->lSites();idx++), {
 	    sgrid->LocalIndexToLocalCoor(idx,clbuf);
 	    cgbuf = clbuf;
 	    cgbuf[dim] = clbuf[dim]+L*pc;
 	    peekLocalSite(s,pgbuf,cgbuf);
 	    pokeLocalSite(s,result,clbuf);
-	  }
+	});
       }
     result = result*div;
       
diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 4bc169eb..968d0097 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid);
 // LinearOperators Take a something and return a something.
 /////////////////////////////////////////////////////////////////////////////////////////////
 //
-// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
+// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian Conjugateugate (transpose if real):
 //SBase
 //   i)  F(a x + b y) = aF(x) + b F(y).
 //  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
diff --git a/lib/algorithms/approx/Chebyshev.h b/lib/algorithms/approx/Chebyshev.h
index 9ef468e4..d36d7363 100644
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -55,7 +55,7 @@ private:
 public:
   void csv(std::ostream &out){
     RealD diff = hi-lo;
-    RealD delta = (hi-lo)*1.0e-9;
+    RealD delta = diff*1.0e-9;
     for (RealD x=lo; x<hi; x+=delta) {
       delta*=1.1;
       RealD f = approx(x);
diff --git a/lib/algorithms/approx/Forecast.h b/lib/algorithms/approx/Forecast.h
index 843e7bcc..b9d73562 100644
--- a/lib/algorithms/approx/Forecast.h
+++ b/lib/algorithms/approx/Forecast.h
@@ -60,7 +60,7 @@ public:
     if(degree == 0){ chi = zero; return chi; }
     else if(degree == 1){ return prev_solns[0]; }
 
-    RealD dot;
+    //    RealD dot;
     ComplexD xp;
     Field r(phi); // residual
     Field Mv(phi);
@@ -92,7 +92,7 @@ public:
     for(int j=0; j<degree; j++){
       for(int k=j+1; k<degree; k++){
 	G[j][k] = innerProduct(v[j],MdagMv[k]);
-	G[k][j] = std::conj(G[j][k]);
+	G[k][j] = conjugate(G[j][k]);
       }}
 
     // Gauss-Jordan elimination with partial pivoting
@@ -100,7 +100,7 @@ public:
 
       // Perform partial pivoting
       int k = i;
-      for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
+      for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } }
       if(k != i){
 	xp = b[k];
 	b[k] = b[i];
@@ -136,7 +136,7 @@ public:
     for(int i=0; i<degree; i++){
       tmp = -b[i];
       for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
-      tmp = std::conj(tmp)*tmp;
+      tmp = conjugate(tmp)*tmp;
       true_r += std::sqrt(tmp.real());
     }
 
diff --git a/lib/algorithms/approx/Remez.cc b/lib/algorithms/approx/Remez.cc
index ca00a330..6fbaadc9 100644
--- a/lib/algorithms/approx/Remez.cc
+++ b/lib/algorithms/approx/Remez.cc
@@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
 // Search for error maxima and minima
 void AlgRemez::search(bigfloat *step) {
   bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
-  int i, j, meq, emsign, ensign, steps;
+  int i, meq, emsign, ensign, steps;
 
   meq = neq + 1;
   bigfloat *yy = new bigfloat[meq];
@@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) {
   bigfloat eclose = 1.0e30;
   bigfloat farther = 0l;
 
-  j = 1;
   xx0 = apstrt;
 
   for (i = 0; i < meq; i++) {
diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index 686de545..b2a268b1 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -30,13 +30,14 @@ directory
 #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
 #define GRID_BLOCK_CONJUGATE_GRADIENT_H
 
+#include <Grid/lattice/Lattice_matrix_reduction.h>
 
 NAMESPACE_BEGIN(Grid);
 
 enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
 
 //////////////////////////////////////////////////////////////////////////
-// Block conjugate gradient. Dimension zero should be the block direction
+// Block Conjugate gradient. Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
 template <class Field>
 class BlockConjugateGradient : public OperatorFunction<Field> {
@@ -178,7 +179,7 @@ public:
     for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
 
     /************************************************************************
-     * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
+     * Block Conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
      ************************************************************************
      * Dimensions:
      *
@@ -276,7 +277,7 @@ public:
 
       for(int b=0;b<Nblock;b++) {
 	rrsum+=real(m_rr(b,b));
-	rr = real(m_rr(b,b))/ssq[b];
+	rr    =real(m_rr(b,b))/ssq[b];
 	if ( rr > max_resid ) max_resid = rr;
       }
 
@@ -317,7 +318,7 @@ public:
     IterationsToComplete = k;
   }
   //////////////////////////////////////////////////////////////////////////
-  // Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
+  // Block Conjugate gradient; Original O'Leary Dimension zero should be the block direction
   //////////////////////////////////////////////////////////////////////////
   void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
   {
@@ -360,7 +361,7 @@ public:
   
 
     /************************************************************************
-     * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
+     * Block Conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
      ************************************************************************
      * O'Leary : R = B - A X
      * O'Leary : P = M R ; preconditioner M = 1
@@ -463,7 +464,7 @@ public:
     IterationsToComplete = k;
   }
   //////////////////////////////////////////////////////////////////////////
-  // multiRHS conjugate gradient. Dimension zero should be the block direction
+  // multiRHS Conjugate gradient. Dimension zero should be the block direction
   // Use this for spread out across nodes
   //////////////////////////////////////////////////////////////////////////
   void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index 24d69e38..f195f735 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -135,12 +135,12 @@ public:
         Linop.HermOpAndNorm(psi, mmp, d, qq);
         p = mmp - src;
 
-        RealD srcnorm = sqrt(norm2(src));
-        RealD resnorm = sqrt(norm2(p));
+        RealD srcnorm = std::sqrt(norm2(src));
+        RealD resnorm = std::sqrt(norm2(p));
         RealD true_residual = resnorm / srcnorm;
 
         std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
-        std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+        std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
 
diff --git a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
index 7acc6af8..9f46912e 100644
--- a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
@@ -110,7 +110,7 @@ public:
     // Check if guess is really REALLY good :)
     if (cp <= rsq) {
       std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
-      std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+      std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
       return;
     }
 
@@ -180,12 +180,12 @@ public:
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 	p = mmp - src;
 
-	RealD srcnorm = sqrt(norm2(src));
-	RealD resnorm = sqrt(norm2(p));
+	RealD srcnorm = std::sqrt(norm2(src));
+	RealD resnorm = std::sqrt(norm2(p));
 	RealD true_residual = resnorm / srcnorm;
 
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
-	std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+	std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
 	std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
 	std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
 
diff --git a/lib/algorithms/iterative/ConjugateResidual.h b/lib/algorithms/iterative/ConjugateResidual.h
index ca5c357b..28b1a2d2 100644
--- a/lib/algorithms/iterative/ConjugateResidual.h
+++ b/lib/algorithms/iterative/ConjugateResidual.h
@@ -49,7 +49,7 @@ public:
 
   void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
 
-    RealD a, b, c, d;
+    RealD a, b; // c, d;
     RealD cp, ssq,rsq;
       
     RealD rAr, rAAr, rArp;
@@ -95,8 +95,8 @@ public:
 	axpy(r,-1.0,src,Ap);
 	RealD true_resid = norm2(r)/ssq;
 	std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
-		 << " computed residual "<<sqrt(cp/ssq)
-		 << " true residual "<<sqrt(true_resid)
+		 << " computed residual "<<std::sqrt(cp/ssq)
+		 << " true residual "<<std::sqrt(true_resid)
 		 << " target "       <<Tolerance <<std::endl;
 	return;
       }
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 25b165b4..897eb52b 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -299,7 +299,7 @@ public:
   template<typename T>  static RealD normalise(T& v) 
   {
     RealD nn = norm2(v);
-    nn = sqrt(nn);
+    nn = std::sqrt(nn);
     v = v * (1.0/nn);
     return nn;
   }
@@ -464,7 +464,7 @@ until convergence
       f *= Qt(k2-1,Nm-1);
       f += lme[k2-1] * evec[k2];
       beta_k = norm2(f);
-      beta_k = sqrt(beta_k);
+      beta_k = std::sqrt(beta_k);
       std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
 	  
       RealD betar = 1.0/beta_k;
@@ -817,7 +817,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
     
     // determination of 2x2 leading submatrix
     RealD dsub = lmd[kmax-1]-lmd[kmax-2];
-    RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
+    RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
     RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
     // (Dsh: shift)