Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-10 07:55:35 +00:00)
Remove warnings under NVCC and move parallel_for to thread-loop
This commit is contained in:
parent 0e9b591c1c · commit 802404c78c
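The diff below systematically replaces the old `parallel_for`/`parallel_critical`/`parallel_for_nest5` macros with the `thread_loop`/`thread_critical`/`thread_loop_collapse` family, and comments out unused variables that NVCC warns about. As a rough guide to reading the call sites, here is a minimal sketch of how such macros can be defined over OpenMP; Grid's actual definitions live in its threading header and may differ in detail (scheduling clauses, accelerator variants):

    // Minimal sketch, assuming an OpenMP backend; not Grid's actual definitions.
    #define DO_PRAGMA_(x) _Pragma(#x)
    #define DO_PRAGMA(x)  DO_PRAGMA_(x)

    #ifdef _OPENMP
    #define thread_loop(range, ...) \
        DO_PRAGMA(omp parallel for) for range { __VA_ARGS__ }
    #define thread_loop_collapse(n, range, ...) \
        DO_PRAGMA(omp parallel for collapse(n)) for range { __VA_ARGS__ }
    #define thread_critical DO_PRAGMA(omp critical)
    #else
    #define thread_loop(range, ...)             for range { __VA_ARGS__ }
    #define thread_loop_collapse(n, range, ...) for range { __VA_ARGS__ }
    #define thread_critical
    #endif

    // A call like
    //     thread_loop( (int i = 0; i < n; ++i), { body; });
    // then expands to an OpenMP-parallelised for loop over the given range.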
@@ -170,7 +170,7 @@ public:
     if ((MatLeft::Options == Eigen::RowMajor) and
         (MatRight::Options == Eigen::ColMajor))
     {
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_loop( (unsigned int r = 0; r < a.rows(); ++r),
         {
             C tmp;
 #ifdef USE_MKL
@@ -178,15 +178,15 @@ public:
 #else
             tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
-            parallel_critical
+            thread_critical
             {
                 acc += tmp;
             }
-        }
+        });
     }
     else
     {
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
-        {
+        thread_loop( (unsigned int c = 0; c < a.cols(); ++c),
+        {
             C tmp;
 #ifdef USE_MKL
@@ -194,11 +194,11 @@ public:
 #else
             tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
-            parallel_critical
+            thread_critical
             {
                 acc += tmp;
             }
-        }
+        });
     }
 }

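The kernel above is a parallel trace reduction: each thread computes a private `tmp` for its row (or column) and folds it into the shared accumulator `acc` inside `thread_critical`, so no two threads update `acc` at once. A self-contained sketch of the same pattern in plain OpenMP and Eigen (assuming the macros expand roughly as sketched earlier):

    #include <complex>
    #include <Eigen/Dense>

    // tr(A*B) via row-times-column dot products; .conjugate() cancels the
    // implicit conjugation Eigen's dot() applies to its left operand.
    std::complex<double> parallelTrace(const Eigen::MatrixXcd &a,
                                       const Eigen::MatrixXcd &b)
    {
        std::complex<double> acc = 0.;
        const int nr = static_cast<int>(a.rows());

        #pragma omp parallel for
        for (int r = 0; r < nr; ++r)
        {
            std::complex<double> tmp = a.row(r).conjugate().dot(b.col(r));
            #pragma omp critical
            acc += tmp;
        }
        return acc;
    }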
@@ -646,14 +646,14 @@ void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
     bytes += kernel.bytes(N_iii, N_jjj);

     START_TIMER("cache copy");
-    parallel_for_nest5(int e =0;e<next_;e++)
+    thread_loop_collapse( 5, (int e =0;e<next_;e++),
     for(int s =0;s< nstr_;s++)
     for(int t =0;t< nt_;t++)
     for(int iii=0;iii< N_iii;iii++)
     for(int jjj=0;jjj< N_jjj;jjj++)
     {
         mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
-    }
+    });
     STOP_TIMER("cache copy");
 }

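`thread_loop_collapse( 5, ...)` parallelises all five perfectly nested loops as one flat iteration space (OpenMP `collapse(5)`), which matters when the outer extents are individually too small to feed every thread. Under the OpenMP reading sketched above, the call is expected to expand to the following (identifiers are the ones from the hunk, so this is an illustration rather than standalone code):

    #pragma omp parallel for collapse(5)
    for(int e  =0; e  <next_; e++)
    for(int s  =0; s  <nstr_; s++)
    for(int t  =0; t  <nt_;   t++)
    for(int iii=0; iii<N_iii; iii++)
    for(int jjj=0; jjj<N_jjj; jjj++)
    {
        mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
    }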
@@ -77,7 +77,7 @@ size_t Hadrons::typeHash(const std::type_info *info)
     return info->hash_code();
 }

-constexpr unsigned int maxNameSize = 1024u;
+//constexpr unsigned int maxNameSize = 1024u;

 std::string Hadrons::typeName(const std::type_info *info)
 {
@@ -49,7 +49,7 @@ public:
                                     double                            , M5,
                                     double                            , b,
                                     double                            , c,
-                                    std::vector<std::complex<double>> , omega,
+                                    std::vector<std::complex<double> >, omega,
                                     std::string                       , boundary,
                                     std::string                       , twist);
 };
@@ -131,9 +131,28 @@ void TZMobiusDWF<FImpl>::setup(void)
     typename ZMobiusFermion<FImpl>::ImplParams implParams;
     implParams.boundary_phases = strToVec<Complex>(par().boundary);
     implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
-    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
-                     grb5, g4, grb4, par().mass, par().M5, omega,
-                     par().b, par().c, implParams);
+
+    assert(par().Ls==omega.size());
+    int Ls=par().Ls;
+    std::vector<ComplexD> _omega(Ls);
+    for(int i=0;i<Ls;i++){
+      _omega[i] = omega[i];
+    }
+    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls,
+                     U, g5, grb5, g4, grb4,
+                     par().mass, par().M5,
+                     _omega, par().b, par().c, implParams);
+
+    /*
+      ZMobiusFermion(GaugeField            &_Umu,
+                     GridCartesian         &FiveDimGrid,
+                     GridRedBlackCartesian &FiveDimRedBlackGrid,
+                     GridCartesian         &FourDimGrid,
+                     GridRedBlackCartesian &FourDimRedBlackGrid,
+                     RealD _mass,RealD _M5,
+                     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) :
+    */
+
 }

 // execution ///////////////////////////////////////////////////////////////////

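The rewritten setup copies the XML-parsed `std::vector<std::complex<double>>` into a `std::vector<ComplexD>` before calling the constructor whose signature is quoted in the comment: under NVCC, Grid's `ComplexD` is not necessarily `std::complex<double>`, so the implicit conversion the old call relied on is not guaranteed to exist. A generic sketch of the idiom (the helper name `toGridComplex` is illustrative, not from the source):

    #include <complex>
    #include <vector>

    // Element-wise conversion from std::complex<double> to a complex type
    // GridComplex that is assumed constructible from (real, imag) doubles.
    template <typename GridComplex>
    std::vector<GridComplex> toGridComplex(const std::vector<std::complex<double>> &in)
    {
        std::vector<GridComplex> out(in.size());
        for (std::size_t i = 0; i < in.size(); ++i)
        {
            out[i] = GridComplex(in[i].real(), in[i].imag());
        }
        return out;
    }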
@@ -140,8 +140,8 @@ void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &l
     auto lret_v = lret.View();
     auto a_v = a.View();
     auto b_v = b.View();
-    parallel_for(auto site=lret_v.begin();site<lret_v.end();site++) {
-        vTComplex left;
+    thread_loop( (auto site=lret_v.begin();site<lret_v.end();site++) ,{
+        vTComplex left;
         for(int si=0; si < Ns; ++si){
         for(int sj=0; sj < Ns; ++sj){
             for (int ci=0; ci < Nc; ++ci){
@@ -152,7 +152,7 @@ void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &l
                 lret_v[site]()(si,sj)(ci,cj)=left()*b_v[site]();
             }}
         }}
-    }
+    });
 #endif
 }

@@ -68,6 +68,7 @@ private:
 class OperatorFunctionWrapper: public OperatorFunction<Field>
 {
 public:
+    using OperatorFunction<Field>::operator();
     OperatorFunctionWrapper(LinearFunction<Field> &fn): fn_(fn) {};
     virtual ~OperatorFunctionWrapper(void) = default;
     virtual void operator()(LinearOperatorBase<Field> &op,
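The added `using OperatorFunction<Field>::operator();` re-exposes the base-class call operators: a derived class that declares any `operator()` hides all base-class overloads of that name, which NVCC reports as a hidden-overload warning. A standalone illustration of the rule (sketch, not Grid code):

    // Minimal illustration of the name-hiding issue the using-declaration fixes.
    struct Base
    {
        virtual void operator()(int) {}
        virtual void operator()(int, int) {}
        virtual ~Base() = default;
    };

    struct Derived : Base
    {
        using Base::operator();          // without this, operator()(int,int) is hidden
        void operator()(int) override {}
    };

    int main()
    {
        Derived d;
        d(1);        // Derived::operator()(int)
        d(1, 2);     // Base::operator()(int,int): visible only via the using-declaration
    }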
@@ -251,18 +251,18 @@ int main(int argc, char* argv[])

     // parse parameter file
     ContractorPar par;
-    unsigned int nMat, nCont;
+    // unsigned int nMat,nCont;
     XmlReader reader(parFilename);

     read(reader, "global",    par.global);
     read(reader, "a2aMatrix", par.a2aMatrix);
     read(reader, "product",   par.product);
-    nMat  = par.a2aMatrix.size();
-    nCont = par.product.size();
+    // nMat  = par.a2aMatrix.size();
+    // nCont = par.product.size();

     // create diskvectors
     std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
-    unsigned int cacheSize;
+    // unsigned int cacheSize;

     for (auto &p: par.a2aMatrix)
     {
@@ -281,7 +281,8 @@ int main(int argc, char* argv[])
     for (auto &p: par.a2aMatrix)
     {
         std::string filename = p.file;
-        double t, size;
+        double t;
+        // double size;

         tokenReplace(filename, "traj", traj);
         std::cout << "======== Loading '" << filename << "'" << std::endl;
@@ -305,7 +306,8 @@ int main(int argc, char* argv[])
     std::vector<A2AMatrixTr<ComplexD>> lastTerm(par.global.nt);
     A2AMatrix<ComplexD> prod, buf, tmp;
     TimerArray tAr;
-    double fusec, busec, flops, bytes, tusec;
+    double fusec, busec, flops, bytes;
+    // double tusec;
     Contractor::CorrelatorResult result;

     tAr.startTimer("Total");
@@ -351,11 +353,11 @@ int main(int argc, char* argv[])

     tAr.startTimer("Transpose caching");
     lastTerm[t].resize(ref.rows(), ref.cols());
-    parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
+    thread_loop( (unsigned int j = 0; j < ref.cols(); ++j),
     for (unsigned int i = 0; i < ref.rows(); ++i)
     {
         lastTerm[t](i, j) = ref(i, j);
-    }
+    });
     tAr.stopTimer("Transpose caching");
 }
 bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);

@@ -11,12 +11,14 @@ using namespace Hadrons;
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
     MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
-    MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+    MPI_Comm_rank(MPI_COMM_WORLD, &(rank));\
+    assert(rank<nMpi)

 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
-#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0 ; assert(rank<nMpi)
 #define BARRIER()
 #define INIT()
 #define FINALIZE()
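Both branches of `GET_RANK` now end in `assert(rank<nMpi)`, so an inconsistent rank/size pair fails immediately; note the MPI branch also gains the `;\` continuation that splices the assert into the macro. A sketch of what a call site expands to in an MPI build (assuming `GRID_COMMS_MPI3` is defined):

    #include <cassert>
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        int nMpi, rank;

        MPI_Init(NULL, NULL);                  // INIT();
        MPI_Comm_size(MPI_COMM_WORLD, &nMpi);  // GET_RANK(rank, nMpi);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);  //   ...expands to these
        assert(rank < nMpi);                   //   three statements
        MPI_Barrier(MPI_COMM_WORLD);           // BARRIER();
        MPI_Finalize();                        // FINALIZE();
        return 0;
    }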
@@ -47,7 +49,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
     if (rank == 0)
     {
         std::cout << std::setw(34) << name << ": diff= "
-                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(12) << abs(buf-ref)
                   << std::setw(10) << t/1.0e6 << " sec "
                   << std::setw(10) << flops/t/1.0e3 << " GFlop/s "
                   << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
@@ -61,7 +63,8 @@ inline void mulBenchmark(const std::string name, const MatV &left,
                          const MatV &right, const Mat &ref, Function fn)
 {
     double       t, flops, bytes;
-    double       nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
+    double       nr = left[0].rows(), nc = left[0].cols();
+    // double    n = nr*nc;
     unsigned int nMat = left.size();
     int          nMpi, rank;
     Mat          buf(left[0].rows(), left[0].rows());
@@ -202,7 +205,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
         auto nr = a.rows(), nc = a.cols();

         res = 0.;
-        parallel_for (unsigned int i = 0; i < nr; ++i)
+        thread_loop( (unsigned int i = 0; i < nr; ++i),
         {
             ComplexD tmp = 0.;

@@ -210,11 +213,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Naive loop cols first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -222,7 +225,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
         auto nr = a.rows(), nc = a.cols();

         res = 0.;
-        parallel_for (unsigned int j = 0; j < nc; ++j)
+        thread_loop( (unsigned int j = 0; j < nc; ++j),
         {
             ComplexD tmp = 0.;

@@ -230,11 +233,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen tr(A*B)", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -245,31 +248,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_loop( (unsigned int r = 0; r < a.rows(); ++r),
         {
             ComplexD tmp;

             tmp = a.row(r).conjugate().dot(b.col(r));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen col-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_loop( (unsigned int c = 0; c < a.cols(); ++c),
         {
             ComplexD tmp;

             tmp = a.col(c).conjugate().dot(b.row(c));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen Hadamard", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -281,31 +284,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_loop( (unsigned int r = 0; r < a.rows(); ++r),
         {
             ComplexD tmp;

             zdotuRow(tmp, r, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("MKL col-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_loop( (unsigned int c = 0; c < a.cols(); ++c),
         {
             ComplexD tmp;

             zdotuCol(tmp, c, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
 #endif
     BARRIER();
@@ -403,11 +406,7 @@ int main(int argc, char *argv[])

     std::cout << nMpi << " MPI processes" << std::endl;
 #ifdef GRID_OMP
-#pragma omp parallel
-    {
-#pragma omp single
-        std::cout << omp_get_num_threads() << " threads\n" << std::endl;
-    }
+    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
 #else
     std::cout << "Single-threaded\n" << std::endl;
 #endif
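One behavioural caveat with the simplified line: `omp_get_num_threads()` returns 1 when called outside a parallel region, which is exactly why the removed code wrapped the call in `#pragma omp parallel`/`#pragma omp single`. A sketch that keeps a single unguarded call but still reports the pool size would use `omp_get_max_threads()` instead (an alternative, not what this commit does):

    #include <iostream>
    #include <omp.h>

    // Reports the number of threads the runtime would use for the next
    // parallel region, without having to open one.
    void printThreadCount(void)
    {
        std::cout << omp_get_max_threads() << " threads\n" << std::endl;
    }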