Mirror of https://github.com/paboyle/Grid.git
Synced 2025-06-12 20:27:06 +01:00

Merge GPU support (upstream/develop) into distillation branch.

This compiles and looks right ... but may need some testing

* develop: (762 commits)
  Tensor ambiguous fix
  Fix for GCC preprocessor/pragma handling bug
  Trips up NVCC for reasons I don't understand on Summit
  Fix GCC complaint
  Zero() change
  Force a couple of things to compile on NVCC
  Remove debug code
  nvcc error suppress
  Merge develop
  Reduction finished and hopefully fixes CI regression fail on single precision and force Double precision variants for summation accuracy
  Update todo list
  Freeze the seed
  Fix compiling of MSource::Gauss for single precision
  Think the reduction is now sorted and cleaned up
  Fix force term
  Printing improvement
  GPU reduction fix and also exit backtrace option
  GPU friendly
  Simplify the comms benchmark
  ...

# Conflicts:
#	Grid/communicator/SharedMemoryMPI.cc
#	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
#	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
#	Grid/qcd/smearing/StoutSmearing.h
#	Hadrons/Modules.hpp
#	Hadrons/Utilities/Contractor.cc
#	Hadrons/modules.inc
#	tests/forces/Test_dwf_force_eofa.cc
#	tests/forces/Test_dwf_gpforce_eofa.cc
@@ -347,11 +347,12 @@ int main(int argc, char* argv[])
                 tAr.startTimer("Transpose caching");
                 lastTerm[t].resize(ref.rows(), ref.cols());
-                parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
-                for (unsigned int i = 0; i < ref.rows(); ++i)
-                {
-                    lastTerm[t](i, j) = ref(i, j);
-                }
+                thread_for( j,ref.cols(),{
+                    for (unsigned int i = 0; i < ref.rows(); ++i)
+                    {
+                        lastTerm[t](i, j) = ref(i, j);
+                    }
+                });
                 tAr.stopTimer("Transpose caching");
             }
             bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
 
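A note on the pattern above, which recurs throughout this diff: OpenMP-style parallel_for loops are rewritten as Grid's thread_for macro, which takes the loop body as a trailing macro argument so the same call site can be retargeted at different backends. As a rough sketch of how such a macro can be built over OpenMP (an illustration under that assumption, not necessarily Grid's exact definition):

    #include <cstdint>

    // Sketch only: a thread_for-style macro lowering to an OpenMP parallel for.
    // _Pragma allows the pragma to be emitted from inside a macro expansion.
    #define DO_PRAGMA_(x) _Pragma(#x)
    #define DO_PRAGMA(x)  DO_PRAGMA_(x)

    #define thread_for(i, num, ...) \
        DO_PRAGMA(omp parallel for schedule(static)) \
        for (uint64_t i = 0; i < (num); ++i) { __VA_ARGS__ };

With that shape, thread_for( j,ref.cols(),{ ... }); expands to an ordinary parallel loop on CPU-only builds.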
@@ -34,16 +34,20 @@ See the full license in the file "LICENSE" in the top level distribution directory
 
 using namespace Grid;
 using namespace Hadrons;
 
+const int RowMajor = Eigen::RowMajor;
+const int ColMajor = Eigen::ColMajor;
+
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
 MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
-MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank));\
+assert(rank<nMpi)
 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
-#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0 ; assert(rank<nMpi)
 #define BARRIER()
 #define INIT()
 #define FINALIZE()
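The new assert(rank<nMpi) gives both the MPI and the serial fallback variant of GET_RANK the same sanity check. A hypothetical usage sketch (not the benchmark's actual main; assumes <cassert> and, under GRID_COMMS_MPI3, the MPI headers are in scope). Note that GET_RANK expands to multiple statements, so it should not be used as the unbraced body of an if:

    int main(int argc, char *argv[])
    {
        int nMpi, rank;

        INIT();               // MPI_Init under GRID_COMMS_MPI3, no-op otherwise
        GET_RANK(rank, nMpi); // now also checks rank < nMpi
        BARRIER();
        FINALIZE();

        return 0;
    }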
@@ -74,7 +78,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
     if (rank == 0)
     {
         std::cout << std::setw(34) << name << ": diff= "
-                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(12) << abs(buf-ref)
                   << std::setw(10) << t/1.0e6 << " sec "
                   << std::setw(10) << flops/t/1.0e3 << " GFlop/s "
                   << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
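The meaning of the printed diff changes here: for a complex deviation d, std::norm(d) is the squared magnitude |d|^2, while abs(d) is |d|. A two-line check:

    #include <cassert>
    #include <complex>

    int main()
    {
        std::complex<double> d(3., 4.);
        assert(std::norm(d) == 25.); // squared magnitude |d|^2
        assert(std::abs(d)  == 5.);  // magnitude |d|
    }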
@@ -88,7 +92,8 @@ inline void mulBenchmark(const std::string name, const MatV &left,
                          const MatV &right, const Mat &ref, Function fn)
 {
     double t, flops, bytes;
-    double nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
+    double nr = left[0].rows(), nc = left[0].cols();
+    // double n = nr*nc;
     unsigned int nMat = left.size();
     int nMpi, rank;
     Mat buf(left[0].rows(), left[0].rows());
@@ -126,22 +131,22 @@ static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
     const ComplexD *aPt, *bPt;
     unsigned int aInc, bInc;
 
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         aPt = a.data() + aRow*a.cols();
         aInc = 1;
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         aPt = a.data() + aRow;
         aInc = a.rows();
     }
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         bPt = b.data() + aRow;
         bInc = b.cols();
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         bPt = b.data() + aRow*b.rows();
         bInc = 1;
@@ -156,22 +161,22 @@ static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
     const ComplexD *aPt, *bPt;
     unsigned int aInc, bInc;
 
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         aPt = a.data() + aCol;
         aInc = a.cols();
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         aPt = a.data() + aCol*a.rows();
         aInc = 1;
     }
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         bPt = b.data() + aCol*b.cols();
         bInc = 1;
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         bPt = b.data() + aCol;
         bInc = b.rows();
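These zdotuRow/zdotuCol helpers reduce a row-of-A times column-of-B product to one strided dot product: each branch picks a base pointer and an increment matching the operand's storage order (a row is contiguous with stride 1 in row-major storage but has stride rows() in column-major, and conversely for columns). The pointers presumably then feed MKL's unconjugated complex dot product; a self-contained sketch under that assumption:

    #include <complex>
    #include <mkl.h> // assumption: MKL's CBLAS interface

    // Row aRow of a row-major n x n matrix A dotted (unconjugated) with
    // column aRow of a row-major n x n matrix B, as zdotuRow would set it up.
    std::complex<double> rowDotCol(const std::complex<double> *A,
                                   const std::complex<double> *B,
                                   int n, int aRow)
    {
        std::complex<double> res;
        const std::complex<double> *aPt = A + aRow*n; // row: contiguous, stride 1
        const std::complex<double> *bPt = B + aRow;   // column: stride = n
        cblas_zdotu_sub(n, aPt, 1, bPt, n, &res);     // res = sum_k aPt[k]*bPt[k*n]
        return res;
    }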
@@ -196,20 +201,20 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
 {
     std::cout << "==== tr(A*B) benchmarks" << std::endl;
     std::cout << "A matrices use ";
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
     std::cout << "B matrices use ";
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
@@ -229,7 +234,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
         auto nr = a.rows(), nc = a.cols();
 
         res = 0.;
-        parallel_for (unsigned int i = 0; i < nr; ++i)
+        thread_for(i,nr,
         {
             ComplexD tmp = 0.;
 
@@ -237,11 +242,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Naive loop cols first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
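The reduction idiom in these naive-loop benchmarks: each thread accumulates a private tmp over one row (or column), then folds it into the shared res inside thread_critical (formerly parallel_critical), so the critical section is entered once per row rather than once per element. A plain-OpenMP stand-in for the same pattern, assuming thread_critical wraps #pragma omp critical:

    #include <complex>
    #include <cstddef>
    #include <vector>

    // tr(A*B) = sum_ij A_ij B_ji with per-thread partial sums.
    std::complex<double> trProduct(const std::vector<std::complex<double>> &a,
                                   const std::vector<std::complex<double>> &b,
                                   std::size_t n) // a, b: n x n, row-major
    {
        std::complex<double> res = 0.;
        #pragma omp parallel for
        for (long i = 0; i < static_cast<long>(n); ++i)
        {
            std::complex<double> tmp = 0.;
            for (std::size_t j = 0; j < n; ++j)
            {
                tmp += a[i*n + j]*b[j*n + i];
            }
            #pragma omp critical
            res += tmp; // one critical entry per row, not per element
        }
        return res;
    }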
@@ -249,7 +254,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
         auto nr = a.rows(), nc = a.cols();
 
         res = 0.;
-        parallel_for (unsigned int j = 0; j < nc; ++j)
+        thread_for(j,nc,
         {
             ComplexD tmp = 0.;
 
@@ -257,11 +262,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen tr(A*B)", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -272,31 +277,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_for(r,a.rows(),
         {
             ComplexD tmp;
 
             tmp = a.row(r).conjugate().dot(b.col(r));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen col-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_for(c,a.cols(),
         {
             ComplexD tmp;
 
             tmp = a.col(c).conjugate().dot(b.row(c));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen Hadamard", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
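A subtlety in the Eigen dot-product benchmarks: for complex scalars, Eigen's x.dot(y) is conjugate-linear in its left operand, i.e. it computes sum_i conj(x_i)*y_i. The explicit .conjugate() before .dot() cancels that conjugation, leaving the plain bilinear sum that tr(A*B) = sum_ij A_ij B_ji requires. A minimal check, assuming Eigen is available:

    #include <cassert>
    #include <complex>
    #include <Eigen/Dense>

    int main()
    {
        Eigen::VectorXcd x(1), y(1);
        x << std::complex<double>(0., 1.); // x = (i)
        y << std::complex<double>(0., 1.); // y = (i)

        assert(x.dot(y) == std::complex<double>(1., 0.));  // conj(i)*i = 1
        assert(x.conjugate().dot(y)
               == std::complex<double>(-1., 0.));          // i*i = -1
    }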
@@ -308,31 +313,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_for(r,a.rows(),
         {
             ComplexD tmp;
 
             zdotuRow(tmp, r, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("MKL col-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_for(c,a.cols(),
         {
             ComplexD tmp;
 
             zdotuCol(tmp, c, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
 #endif
     BARRIER();
@@ -356,11 +361,11 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsign
 {
     std::cout << "==== A*B benchmarks" << std::endl;
     std::cout << "all matrices use ";
-    if (Mat::Options == Eigen::RowMajor)
+    if (Mat::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (Mat::Options == Eigen::ColMajor)
+    else if (Mat::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
@@ -383,13 +388,13 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsign
     [](Mat &res, const Mat &a, const Mat &b)
     {
         const ComplexD one(1., 0.), zero(0., 0.);
-        if (Mat::Options == Eigen::RowMajor)
+        if (Mat::Options == RowMajor)
         {
             cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
-        else if (Mat::Options == Eigen::ColMajor)
+        else if (Mat::Options == ColMajor)
         {
             cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
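Apart from the layout flag, the only difference between the two cblas_zgemm branches is the leading dimension passed for each matrix: the column count for row-major storage, the row count for column-major. A standalone 2x2 sketch, assuming MKL's CBLAS interface:

    #include <complex>
    #include <mkl.h> // assumption: MKL's CBLAS interface

    int main()
    {
        using ComplexD = std::complex<double>;
        const ComplexD one(1., 0.), zero(0., 0.);
        ComplexD a[4] = {1., 2., 3., 4.}; // 2x2, row-major
        ComplexD b[4] = {1., 0., 0., 1.}; // 2x2 identity
        ComplexD c[4];

        // C = 1*A*B + 0*C; row-major, so every leading dimension is a column count.
        cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 2, &one, a, 2, b, 2, &zero, c, 2);

        return 0; // c now holds a copy of a
    }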
@@ -430,11 +435,7 @@ int main(int argc, char *argv[])
 
     std::cout << nMpi << " MPI processes" << std::endl;
 #ifdef GRID_OMP
-#pragma omp parallel
-{
-    #pragma omp single
-    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
-}
+    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
 #else
     std::cout << "Single-threaded\n" << std::endl;
 #endif
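One behavioral caveat with the simplification above: omp_get_num_threads() returns the team size of the current parallel region and is 1 in serial code, which is exactly why the removed version queried it from inside #pragma omp parallel (with #pragma omp single so only one thread prints). Called outside a parallel region, as in the new line, it will report 1; omp_get_max_threads() is the query that reflects the configured thread count without opening a region:

    #include <iostream>
    #include <omp.h>

    int main()
    {
        // Reports OMP_NUM_THREADS (or the default) from serial code.
        std::cout << omp_get_max_threads() << " threads" << std::endl;
    }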
@@ -29,7 +29,6 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Hadrons/Environment.hpp>
 
 using namespace Grid;
-using namespace QCD;
 using namespace Hadrons;
 
 template <typename FOut, typename FIn>
@@ -29,7 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Hadrons/Application.hpp>
 
 using namespace Grid;
-using namespace QCD;
+
 using namespace Hadrons;
 
 int main(int argc, char *argv[])