
Merge GPU support (upstream/develop) into distillation branch.

This compiles and looks right ... but may need some testing

* develop: (762 commits)
  Tensor ambiguous fix
  Fix for GCC preprocessor/pragma handling bug
  Trips up NVCC for reasons I don't understand on summit
  Fix GCC complaint
  Zero() change
  Force a couple of things to compile on NVCC
  Remove debug code
  nvcc error suppress
  Merge develop
  Reduction finished and hopefully fixes CI regression failure on single precision and force
  Double precision variants for summation accuracy
  Update todo list
  Freeze the seed
  Fix compiling of MSource::Gauss for single precision
  Think the reduction is now sorted and cleaned up
  Fix force term
  Printing improvement
  GPU reduction fix and also exit backtrace option
  GPU friendly
  Simplify the comms benchmark
  ...

# Conflicts:
#	Grid/communicator/SharedMemoryMPI.cc
#	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
#	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
#	Grid/qcd/smearing/StoutSmearing.h
#	Hadrons/Modules.hpp
#	Hadrons/Utilities/Contractor.cc
#	Hadrons/modules.inc
#	tests/forces/Test_dwf_force_eofa.cc
#	tests/forces/Test_dwf_gpforce_eofa.cc
Michael Marshall committed 2019-09-13 13:30:00 +01:00
796 changed files with 41536 additions and 52391 deletions

View File

@@ -347,11 +347,12 @@ int main(int argc, char* argv[])
         tAr.startTimer("Transpose caching");
         lastTerm[t].resize(ref.rows(), ref.cols());
-        parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
-        for (unsigned int i = 0; i < ref.rows(); ++i)
-        {
-            lastTerm[t](i, j) = ref(i, j);
-        }
+        thread_for( j,ref.cols(),{
+            for (unsigned int i = 0; i < ref.rows(); ++i)
+            {
+                lastTerm[t](i, j) = ref(i, j);
+            }
+        });
         tAr.stopTimer("Transpose caching");
     }
     bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
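
The recurring change in this merge replaces the old parallel_for idiom with thread_for(index, count, { body });, which takes the loop body as a macro argument so the same call site can also be lowered to GPU-friendly sibling macros. A minimal sketch of the mechanism, assuming OpenMP and a hypothetical THREAD_FOR name (Grid's real macro lives in its threading headers and differs in detail):

#include <cstdint>

// Hypothetical THREAD_FOR, illustrating the thread_for idiom only;
// not Grid's exact macro.
#ifdef _OPENMP
#define THREAD_FOR(i, num, ...) \
    _Pragma("omp parallel for schedule(static)") \
    for (uint64_t i = 0; i < (uint64_t)(num); ++i) { __VA_ARGS__ }
#else
#define THREAD_FOR(i, num, ...) \
    for (uint64_t i = 0; i < (uint64_t)(num); ++i) { __VA_ARGS__ }
#endif

// Usage mirroring the transpose-caching hunk above:
//     THREAD_FOR(j, ref.cols(), {
//         for (unsigned int i = 0; i < ref.rows(); ++i)
//             lastTerm[t](i, j) = ref(i, j);
//     });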

View File

@@ -34,16 +34,20 @@ See the full license in the file "LICENSE" in the top level distribution directory
 using namespace Grid;
 using namespace Hadrons;
+const int RowMajor = Eigen::RowMajor;
+const int ColMajor = Eigen::ColMajor;
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
 MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
-MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank));\
+assert(rank<nMpi)
 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
-#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0 ; assert(rank<nMpi)
 #define BARRIER()
 #define INIT()
 #define FINALIZE()
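
The assert added to both branches of GET_RANK guarantees the rank is valid whether or not the binary was built with MPI. A minimal, hypothetical caller, using INIT/GET_RANK/BARRIER/FINALIZE exactly as defined above:

int main(int argc, char *argv[])
{
    int nMpi, rank;

    INIT();               // MPI_Init under GRID_COMMS_MPI3, no-op otherwise
    GET_RANK(rank, nMpi); // also asserts rank < nMpi in both builds
    if (rank == 0)
    {
        // only one process prints
    }
    BARRIER();
    FINALIZE();
    return 0;
}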
@@ -74,7 +78,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
     if (rank == 0)
     {
         std::cout << std::setw(34) << name << ": diff= "
-                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(12) << abs(buf-ref)
                   << std::setw(10) << t/1.0e6 << " sec "
                   << std::setw(10) << flops/t/1.0e3 << " GFlop/s "
                   << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
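
The std::norm to abs swap changes what is reported: for a complex number, std::norm returns the squared magnitude while std::abs returns the magnitude, so the printed diff now has the same scale as the data. A quick illustration:

#include <complex>
#include <iostream>

int main()
{
    std::complex<double> d(3., 4.);
    std::cout << std::norm(d) << std::endl; // 25: squared magnitude |d|^2
    std::cout << std::abs(d)  << std::endl; //  5: magnitude |d|
    return 0;
}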
@@ -88,7 +92,8 @@ inline void mulBenchmark(const std::string name, const MatV &left,
                          const MatV &right, const Mat &ref, Function fn)
 {
     double t, flops, bytes;
-    double nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
+    double nr = left[0].rows(), nc = left[0].cols();
+    // double n = nr*nc;
     unsigned int nMat = left.size();
     int nMpi, rank;
     Mat buf(left[0].rows(), left[0].rows());
@@ -126,22 +131,22 @@ static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
     const ComplexD *aPt, *bPt;
     unsigned int aInc, bInc;
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         aPt  = a.data() + aRow*a.cols();
         aInc = 1;
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         aPt  = a.data() + aRow;
         aInc = a.rows();
     }
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         bPt  = b.data() + aRow;
         bInc = b.cols();
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         bPt  = b.data() + aRow*b.rows();
         bInc = 1;
@@ -156,22 +161,22 @@ static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
     const ComplexD *aPt, *bPt;
     unsigned int aInc, bInc;
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         aPt  = a.data() + aCol;
         aInc = a.cols();
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         aPt  = a.data() + aCol*a.rows();
         aInc = 1;
     }
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         bPt  = b.data() + aCol*b.cols();
         bInc = 1;
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
    {
         bPt  = b.data() + aCol;
         bInc = b.rows();
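
In both helpers the (pointer, increment) pair encodes how to walk one row or column of an Eigen matrix in either storage order: the increment is 1 when the elements are contiguous and the opposite dimension's extent when they are strided. A stand-alone sketch of the strided unconjugated dot product these pairs feed, as a hypothetical scalar stand-in for the MKL zdotu call:

#include <complex>

// Hypothetical scalar version of the strided dot product that the
// (aPt, aInc) / (bPt, bInc) pairs describe; zdotu means no conjugation.
inline void zdotuStrided(std::complex<double> &res,
                         const std::complex<double> *aPt, unsigned int aInc,
                         const std::complex<double> *bPt, unsigned int bInc,
                         unsigned int n)
{
    res = 0.;
    for (unsigned int k = 0; k < n; ++k)
    {
        res += aPt[k*aInc]*bPt[k*bInc];
    }
}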
@@ -196,20 +201,20 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
 {
     std::cout << "==== tr(A*B) benchmarks" << std::endl;
     std::cout << "A matrices use ";
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
     std::cout << "B matrices use ";
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
@@ -229,7 +234,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
         auto nr = a.rows(), nc = a.cols();
         res = 0.;
-        parallel_for (unsigned int i = 0; i < nr; ++i)
+        thread_for(i,nr,
         {
             ComplexD tmp = 0.;
@@ -237,11 +242,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Naive loop cols first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -249,7 +254,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
         auto nr = a.rows(), nc = a.cols();
         res = 0.;
-        parallel_for (unsigned int j = 0; j < nc; ++j)
+        thread_for(j,nc,
         {
             ComplexD tmp = 0.;
@@ -257,11 +262,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen tr(A*B)", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -272,31 +277,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_for(r,a.rows(),
         {
             ComplexD tmp;
             tmp = a.row(r).conjugate().dot(b.col(r));
-            parallel_critical
+            thread_critical
            {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen col-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_for(c,a.cols(),
         {
             ComplexD tmp;
             tmp = a.col(c).conjugate().dot(b.row(c));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen Hadamard", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -308,31 +313,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_for(r,a.rows(),
        {
             ComplexD tmp;
             zdotuRow(tmp, r, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("MKL col-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_for(c,a.cols(),
         {
             ComplexD tmp;
             zdotuCol(tmp, c, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
 #endif
     BARRIER();
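
Every hunk above follows the same reduction shape: each thread accumulates a private partial sum, then folds it into the shared result inside thread_critical, the threaded replacement for parallel_critical. Spelled out in raw OpenMP, under the assumption that thread_critical maps onto omp critical the way thread_for maps onto omp parallel for:

#include <complex>
#include <Eigen/Dense>

// tr(A*B) with a per-thread partial sum merged in a critical section.
std::complex<double> traceAB(const Eigen::MatrixXcd &a, const Eigen::MatrixXcd &b)
{
    std::complex<double> res = 0.;
    #pragma omp parallel for
    for (Eigen::Index i = 0; i < a.rows(); ++i)
    {
        std::complex<double> tmp = 0.;
        for (Eigen::Index j = 0; j < a.cols(); ++j)
        {
            tmp += a(i, j)*b(j, i); // row of A against column of B
        }
        #pragma omp critical
        {
            res += tmp; // one thread at a time updates the shared sum
        }
    }
    return res;
}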
@@ -356,11 +361,11 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
 {
     std::cout << "==== A*B benchmarks" << std::endl;
     std::cout << "all matrices use ";
-    if (Mat::Options == Eigen::RowMajor)
+    if (Mat::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (Mat::Options == Eigen::ColMajor)
+    else if (Mat::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
@@ -383,13 +388,13 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
     [](Mat &res, const Mat &a, const Mat &b)
     {
         const ComplexD one(1., 0.), zero(0., 0.);
-        if (Mat::Options == Eigen::RowMajor)
+        if (Mat::Options == RowMajor)
         {
             cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
-        else if (Mat::Options == Eigen::ColMajor)
+        else if (Mat::Options == ColMajor)
         {
             cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
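
The only asymmetry between the two cblas_zgemm calls is the leading-dimension arguments: the leading dimension is the memory stride between consecutive rows (row-major) or consecutive columns (col-major), hence a.cols() in one branch and a.rows() in the other. As index arithmetic:

#include <cstddef>

// Element (i, j) of a dense matrix with leading dimension ld:
//   row-major: rows are contiguous runs of length ncols, so ld >= ncols
//   col-major: columns are contiguous runs of length nrows, so ld >= nrows
inline std::size_t idxRowMajor(std::size_t i, std::size_t j, std::size_t ld)
{
    return i*ld + j;
}
inline std::size_t idxColMajor(std::size_t i, std::size_t j, std::size_t ld)
{
    return i + j*ld;
}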
@@ -430,11 +435,7 @@ int main(int argc, char *argv[])
     std::cout << nMpi << " MPI processes" << std::endl;
 #ifdef GRID_OMP
-#pragma omp parallel
-{
-#pragma omp single
-    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
-}
+    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
 #else
     std::cout << "Single-threaded\n" << std::endl;
 #endif
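
One subtlety in this simplification: outside of a parallel region omp_get_num_threads() returns 1, which is why the removed code wrapped the query in omp parallel / omp single. To report the team size from serial code the usual call is omp_get_max_threads(); a minimal sketch of the difference:

#include <iostream>
#ifdef _OPENMP
#include <omp.h>
#endif

int main()
{
#ifdef _OPENMP
    std::cout << omp_get_num_threads() << std::endl; // 1: no team is active here
    std::cout << omp_get_max_threads() << std::endl; // size of the next team
    #pragma omp parallel
    {
        #pragma omp single
        std::cout << omp_get_num_threads() << std::endl; // actual team size
    }
#else
    std::cout << "Single-threaded" << std::endl;
#endif
    return 0;
}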

View File

@@ -29,7 +29,6 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Hadrons/Environment.hpp>
 using namespace Grid;
-using namespace QCD;
 using namespace Hadrons;
 template <typename FOut, typename FIn>

View File

@@ -29,7 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Hadrons/Application.hpp>
 using namespace Grid;
-using namespace QCD;
 using namespace Hadrons;
 int main(int argc, char *argv[])