
Merge GPU support (upstream/develop) into distillation branch.

This compiles and looks right ... but may need some testing

* develop: (762 commits)
  Tensor ambiguous fix
  Fix for GCC preprocessor/pragma handling bug
  Trips up NVCC for reasons I don't understand on summit
  Fix GCC complaint
  Zero() change
  Force a couple of things to compile on NVCC
  Remove debug code
  nvcc error suppress
  Merge develop
  Reduction finished and hopefully fixes CI regression failure on single precision and force
  Double precision variants for summation accuracy
  Update todo list
  Freeze the seed
  Fix compiling of MSource::Gauss for single precision
  Think the reduction is now sorted and cleaned up
  Fix force term
  Printing improvement
  GPU reduction fix and also exit backtrace option
  GPU friendly
  Simplify the comms benchmark
  ...

# Conflicts:
#	Grid/communicator/SharedMemoryMPI.cc
#	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
#	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
#	Grid/qcd/smearing/StoutSmearing.h
#	Hadrons/Modules.hpp
#	Hadrons/Utilities/Contractor.cc
#	Hadrons/modules.inc
#	tests/forces/Test_dwf_force_eofa.cc
#	tests/forces/Test_dwf_gpforce_eofa.cc
Michael Marshall committed 2019-09-13 13:30:00 +01:00
796 changed files with 41536 additions and 52391 deletions

View File

@@ -347,11 +347,12 @@ int main(int argc, char* argv[])
         tAr.startTimer("Transpose caching");
         lastTerm[t].resize(ref.rows(), ref.cols());
-        parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
-        for (unsigned int i = 0; i < ref.rows(); ++i)
-        {
-            lastTerm[t](i, j) = ref(i, j);
-        }
+        thread_for( j,ref.cols(),{
+            for (unsigned int i = 0; i < ref.rows(); ++i)
+            {
+                lastTerm[t](i, j) = ref(i, j);
+            }
+        });
         tAr.stopTimer("Transpose caching");
     }
     bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
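
The recurring change in this merge replaces the old parallel_for idiom with thread_for(index, count, { body });, which takes the loop body as a macro argument so the same call site can also be lowered to GPU-friendly sibling macros. A minimal sketch of the mechanism, assuming OpenMP and a hypothetical THREAD_FOR name (Grid's real macro lives in its threading headers and differs in detail):

#include <cstdint>

// Hypothetical THREAD_FOR, illustrating the thread_for idiom only;
// not Grid's exact macro.
#ifdef _OPENMP
#define THREAD_FOR(i, num, ...) \
    _Pragma("omp parallel for schedule(static)") \
    for (uint64_t i = 0; i < (uint64_t)(num); ++i) { __VA_ARGS__ }
#else
#define THREAD_FOR(i, num, ...) \
    for (uint64_t i = 0; i < (uint64_t)(num); ++i) { __VA_ARGS__ }
#endif

// Usage mirroring the transpose-caching hunk above:
//     THREAD_FOR(j, ref.cols(), {
//         for (unsigned int i = 0; i < ref.rows(); ++i)
//             lastTerm[t](i, j) = ref(i, j);
//     });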

View File

@@ -34,16 +34,20 @@ See the full license in the file "LICENSE" in the top level distribution directory
 using namespace Grid;
 using namespace Hadrons;
+const int RowMajor = Eigen::RowMajor;
+const int ColMajor = Eigen::ColMajor;
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
 MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
-MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank));\
+assert(rank<nMpi)
 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
-#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0 ; assert(rank<nMpi)
 #define BARRIER()
 #define INIT()
 #define FINALIZE()
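
The assert added to both branches of GET_RANK guarantees the rank is valid whether or not the binary was built with MPI. A minimal, hypothetical caller, using INIT/GET_RANK/BARRIER/FINALIZE exactly as defined above:

int main(int argc, char *argv[])
{
    int nMpi, rank;

    INIT();               // MPI_Init under GRID_COMMS_MPI3, no-op otherwise
    GET_RANK(rank, nMpi); // also asserts rank < nMpi in both builds
    if (rank == 0)
    {
        // only one process prints
    }
    BARRIER();
    FINALIZE();
    return 0;
}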
@@ -74,7 +78,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
     if (rank == 0)
     {
         std::cout << std::setw(34) << name << ": diff= "
-                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(12) << abs(buf-ref)
                   << std::setw(10) << t/1.0e6 << " sec "
                   << std::setw(10) << flops/t/1.0e3 << " GFlop/s "
                   << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
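
The std::norm to abs swap changes what is reported: for a complex number, std::norm returns the squared magnitude while std::abs returns the magnitude, so the printed diff now has the same scale as the data. A quick illustration:

#include <complex>
#include <iostream>

int main()
{
    std::complex<double> d(3., 4.);
    std::cout << std::norm(d) << std::endl; // 25: squared magnitude |d|^2
    std::cout << std::abs(d)  << std::endl; //  5: magnitude |d|
    return 0;
}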
@@ -88,7 +92,8 @@ inline void mulBenchmark(const std::string name, const MatV &left,
                          const MatV &right, const Mat &ref, Function fn)
 {
     double t, flops, bytes;
-    double nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
+    double nr = left[0].rows(), nc = left[0].cols();
+    // double n = nr*nc;
     unsigned int nMat = left.size();
     int nMpi, rank;
     Mat buf(left[0].rows(), left[0].rows());
@@ -126,22 +131,22 @@ static inline void zdotuRow(ComplexD &res, const unsigned int aRow,
     const ComplexD *aPt, *bPt;
     unsigned int aInc, bInc;
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         aPt  = a.data() + aRow*a.cols();
         aInc = 1;
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         aPt  = a.data() + aRow;
         aInc = a.rows();
     }
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         bPt  = b.data() + aRow;
         bInc = b.cols();
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         bPt  = b.data() + aRow*b.rows();
         bInc = 1;
@@ -156,22 +161,22 @@ static inline void zdotuCol(ComplexD &res, const unsigned int aCol,
     const ComplexD *aPt, *bPt;
     unsigned int aInc, bInc;
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         aPt  = a.data() + aCol;
         aInc = a.cols();
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         aPt  = a.data() + aCol*a.rows();
         aInc = 1;
     }
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         bPt  = b.data() + aCol*b.cols();
         bInc = 1;
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
    {
         bPt  = b.data() + aCol;
         bInc = b.rows();
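
In both helpers the (pointer, increment) pair encodes how to walk one row or column of an Eigen matrix in either storage order: the increment is 1 when the elements are contiguous and the opposite dimension's extent when they are strided. A stand-alone sketch of the strided unconjugated dot product these pairs feed, as a hypothetical scalar stand-in for the MKL zdotu call:

#include <complex>

// Hypothetical scalar version of the strided dot product that the
// (aPt, aInc) / (bPt, bInc) pairs describe; zdotu means no conjugation.
inline void zdotuStrided(std::complex<double> &res,
                         const std::complex<double> *aPt, unsigned int aInc,
                         const std::complex<double> *bPt, unsigned int bInc,
                         unsigned int n)
{
    res = 0.;
    for (unsigned int k = 0; k < n; ++k)
    {
        res += aPt[k*aInc]*bPt[k*bInc];
    }
}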
@@ -196,20 +201,20 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
 {
     std::cout << "==== tr(A*B) benchmarks" << std::endl;
     std::cout << "A matrices use ";
-    if (MatLeft::Options == Eigen::RowMajor)
+    if (MatLeft::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (MatLeft::Options == Eigen::ColMajor)
+    else if (MatLeft::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
     std::cout << "B matrices use ";
-    if (MatRight::Options == Eigen::RowMajor)
+    if (MatRight::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (MatRight::Options == Eigen::ColMajor)
+    else if (MatRight::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
@@ -229,7 +234,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
         auto nr = a.rows(), nc = a.cols();
         res = 0.;
-        parallel_for (unsigned int i = 0; i < nr; ++i)
+        thread_for(i,nr,
         {
             ComplexD tmp = 0.;
@@ -237,11 +242,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Naive loop cols first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -249,7 +254,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
         auto nr = a.rows(), nc = a.cols();
         res = 0.;
-        parallel_for (unsigned int j = 0; j < nc; ++j)
+        thread_for(j,nc,
         {
             ComplexD tmp = 0.;
@@ -257,11 +262,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen tr(A*B)", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -272,31 +277,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_for(r,a.rows(),
         {
             ComplexD tmp;
             tmp = a.row(r).conjugate().dot(b.col(r));
-            parallel_critical
+            thread_critical
            {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen col-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_for(c,a.cols(),
         {
             ComplexD tmp;
             tmp = a.col(c).conjugate().dot(b.row(c));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen Hadamard", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -308,31 +313,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_for(r,a.rows(),
        {
             ComplexD tmp;
             zdotuRow(tmp, r, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("MKL col-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_for(c,a.cols(),
         {
             ComplexD tmp;
             zdotuCol(tmp, c, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
 #endif
     BARRIER();
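
Every hunk above follows the same reduction shape: each thread accumulates a private partial sum, then folds it into the shared result inside thread_critical, the threaded replacement for parallel_critical. Spelled out in raw OpenMP, under the assumption that thread_critical maps onto omp critical the way thread_for maps onto omp parallel for:

#include <complex>
#include <Eigen/Dense>

// tr(A*B) with a per-thread partial sum merged in a critical section.
std::complex<double> traceAB(const Eigen::MatrixXcd &a, const Eigen::MatrixXcd &b)
{
    std::complex<double> res = 0.;
    #pragma omp parallel for
    for (Eigen::Index i = 0; i < a.rows(); ++i)
    {
        std::complex<double> tmp = 0.;
        for (Eigen::Index j = 0; j < a.cols(); ++j)
        {
            tmp += a(i, j)*b(j, i); // row of A against column of B
        }
        #pragma omp critical
        {
            res += tmp; // one thread at a time updates the shared sum
        }
    }
    return res;
}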
@@ -356,11 +361,11 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
 {
     std::cout << "==== A*B benchmarks" << std::endl;
     std::cout << "all matrices use ";
-    if (Mat::Options == Eigen::RowMajor)
+    if (Mat::Options == RowMajor)
     {
         std::cout << "row-major ordering" << std::endl;
     }
-    else if (Mat::Options == Eigen::ColMajor)
+    else if (Mat::Options == ColMajor)
     {
         std::cout << "col-major ordering" << std::endl;
     }
@@ -383,13 +388,13 @@ void fullMulBenchmark(const unsigned int ni, const unsigned int nj, const unsigned
     [](Mat &res, const Mat &a, const Mat &b)
     {
         const ComplexD one(1., 0.), zero(0., 0.);
-        if (Mat::Options == Eigen::RowMajor)
+        if (Mat::Options == RowMajor)
         {
             cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.cols(), b.data(), b.cols(), &zero,
                         res.data(), res.cols());
         }
-        else if (Mat::Options == Eigen::ColMajor)
+        else if (Mat::Options == ColMajor)
         {
             cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, a.rows(), b.cols(),
                         a.cols(), &one, a.data(), a.rows(), b.data(), b.rows(), &zero,
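
The only asymmetry between the two cblas_zgemm calls is the leading-dimension arguments: the leading dimension is the memory stride between consecutive rows (row-major) or consecutive columns (col-major), hence a.cols() in one branch and a.rows() in the other. As index arithmetic:

#include <cstddef>

// Element (i, j) of a dense matrix with leading dimension ld:
//   row-major: rows are contiguous runs of length ncols, so ld >= ncols
//   col-major: columns are contiguous runs of length nrows, so ld >= nrows
inline std::size_t idxRowMajor(std::size_t i, std::size_t j, std::size_t ld)
{
    return i*ld + j;
}
inline std::size_t idxColMajor(std::size_t i, std::size_t j, std::size_t ld)
{
    return i + j*ld;
}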
@@ -430,11 +435,7 @@ int main(int argc, char *argv[])
     std::cout << nMpi << " MPI processes" << std::endl;
 #ifdef GRID_OMP
-#pragma omp parallel
-{
-#pragma omp single
-    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
-}
+    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
 #else
     std::cout << "Single-threaded\n" << std::endl;
 #endif
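
One subtlety in this simplification: outside of a parallel region omp_get_num_threads() returns 1, which is why the removed code wrapped the query in omp parallel / omp single. To report the team size from serial code the usual call is omp_get_max_threads(); a minimal sketch of the difference:

#include <iostream>
#ifdef _OPENMP
#include <omp.h>
#endif

int main()
{
#ifdef _OPENMP
    std::cout << omp_get_num_threads() << std::endl; // 1: no team is active here
    std::cout << omp_get_max_threads() << std::endl; // size of the next team
    #pragma omp parallel
    {
        #pragma omp single
        std::cout << omp_get_num_threads() << std::endl; // actual team size
    }
#else
    std::cout << "Single-threaded" << std::endl;
#endif
    return 0;
}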

View File

@@ -29,7 +29,6 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Hadrons/Environment.hpp>
 using namespace Grid;
-using namespace QCD;
 using namespace Hadrons;
 template <typename FOut, typename FIn>

View File

@@ -29,7 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution directory
 #include <Hadrons/Application.hpp>
 using namespace Grid;
-using namespace QCD;
 using namespace Hadrons;
 int main(int argc, char *argv[])