Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-10 07:55:35 +00:00)
Remove warnings under NVCC and move parallel_for to thread-loop
This commit is contained in:
parent 0e9b591c1c · commit 802404c78c
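The diff below systematically replaces the old `parallel_for`/`parallel_critical`/`parallel_for_nest5` macros with the `thread_loop`/`thread_critical`/`thread_loop_collapse` family, and comments out unused variables that NVCC warns about. As a rough guide to reading the call sites, here is a minimal sketch of how such macros can be defined over OpenMP; Grid's actual definitions live in its threading header and may differ in detail (scheduling clauses, accelerator variants):

    // Minimal sketch, assuming an OpenMP backend; not Grid's actual definitions.
    #define DO_PRAGMA_(x) _Pragma(#x)
    #define DO_PRAGMA(x)  DO_PRAGMA_(x)

    #ifdef _OPENMP
    #define thread_loop(range, ...) \
        DO_PRAGMA(omp parallel for) for range { __VA_ARGS__ }
    #define thread_loop_collapse(n, range, ...) \
        DO_PRAGMA(omp parallel for collapse(n)) for range { __VA_ARGS__ }
    #define thread_critical DO_PRAGMA(omp critical)
    #else
    #define thread_loop(range, ...)             for range { __VA_ARGS__ }
    #define thread_loop_collapse(n, range, ...) for range { __VA_ARGS__ }
    #define thread_critical
    #endif

    // A call like
    //     thread_loop( (int i = 0; i < n; ++i), { body; });
    // then expands to an OpenMP-parallelised for loop over the given range.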
@@ -170,7 +170,7 @@ public:
     if ((MatLeft::Options == Eigen::RowMajor) and
         (MatRight::Options == Eigen::ColMajor))
     {
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_loop( (unsigned int r = 0; r < a.rows(); ++r),
         {
             C tmp;
 #ifdef USE_MKL
@@ -178,15 +178,15 @@ public:
 #else
             tmp = a.row(r).conjugate().dot(b.col(r));
 #endif
-            parallel_critical
+            thread_critical
             {
                 acc += tmp;
             }
-        }
+        });
     }
     else
     {
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
-        {
+        thread_loop( (unsigned int c = 0; c < a.cols(); ++c),
+        {
             C tmp;
 #ifdef USE_MKL
@@ -194,11 +194,11 @@ public:
 #else
             tmp = a.col(c).conjugate().dot(b.row(c));
 #endif
-            parallel_critical
+            thread_critical
             {
                 acc += tmp;
             }
-        }
+        });
     }
 }

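The kernel above is a parallel trace reduction: each thread computes a private `tmp` for its row (or column) and folds it into the shared accumulator `acc` inside `thread_critical`, so no two threads update `acc` at once. A self-contained sketch of the same pattern in plain OpenMP and Eigen (assuming the macros expand roughly as sketched earlier):

    #include <complex>
    #include <Eigen/Dense>

    // tr(A*B) via row-times-column dot products; .conjugate() cancels the
    // implicit conjugation Eigen's dot() applies to its left operand.
    std::complex<double> parallelTrace(const Eigen::MatrixXcd &a,
                                       const Eigen::MatrixXcd &b)
    {
        std::complex<double> acc = 0.;
        const int nr = static_cast<int>(a.rows());

        #pragma omp parallel for
        for (int r = 0; r < nr; ++r)
        {
            std::complex<double> tmp = a.row(r).conjugate().dot(b.col(r));
            #pragma omp critical
            acc += tmp;
        }
        return acc;
    }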
@@ -646,14 +646,14 @@ void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
     bytes += kernel.bytes(N_iii, N_jjj);

     START_TIMER("cache copy");
-    parallel_for_nest5(int e =0;e<next_;e++)
+    thread_loop_collapse( 5, (int e =0;e<next_;e++),
     for(int s =0;s< nstr_;s++)
     for(int t =0;t< nt_;t++)
     for(int iii=0;iii< N_iii;iii++)
     for(int jjj=0;jjj< N_jjj;jjj++)
     {
         mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
-    }
+    });
     STOP_TIMER("cache copy");
 }

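`thread_loop_collapse( 5, ...)` parallelises all five perfectly nested loops as one flat iteration space (OpenMP `collapse(5)`), which matters when the outer extents are individually too small to feed every thread. Under the OpenMP reading sketched above, the call is expected to expand to the following (identifiers are the ones from the hunk, so this is an illustration rather than standalone code):

    #pragma omp parallel for collapse(5)
    for(int e  =0; e  <next_; e++)
    for(int s  =0; s  <nstr_; s++)
    for(int t  =0; t  <nt_;   t++)
    for(int iii=0; iii<N_iii; iii++)
    for(int jjj=0; jjj<N_jjj; jjj++)
    {
        mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
    }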
@@ -77,7 +77,7 @@ size_t Hadrons::typeHash(const std::type_info *info)
     return info->hash_code();
 }

-constexpr unsigned int maxNameSize = 1024u;
+//constexpr unsigned int maxNameSize = 1024u;

 std::string Hadrons::typeName(const std::type_info *info)
 {
@@ -49,7 +49,7 @@ public:
                                     double                            , M5,
                                     double                            , b,
                                     double                            , c,
-                                    std::vector<std::complex<double>> , omega,
+                                    std::vector<std::complex<double> >, omega,
                                     std::string                       , boundary,
                                     std::string                       , twist);
 };
@@ -131,9 +131,28 @@ void TZMobiusDWF<FImpl>::setup(void)
     typename ZMobiusFermion<FImpl>::ImplParams implParams;
     implParams.boundary_phases = strToVec<Complex>(par().boundary);
     implParams.twist_n_2pi_L   = strToVec<Real>(par().twist);
-    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls, U, g5,
-                     grb5, g4, grb4, par().mass, par().M5, omega,
-                     par().b, par().c, implParams);
+
+    assert(par().Ls==omega.size());
+    int Ls=par().Ls;
+    std::vector<ComplexD> _omega(Ls);
+    for(int i=0;i<Ls;i++){
+      _omega[i] = omega[i];
+    }
+    envCreateDerived(FMat, ZMobiusFermion<FImpl>, getName(), par().Ls,
+                     U, g5, grb5, g4, grb4,
+                     par().mass, par().M5,
+                     _omega, par().b, par().c, implParams);
+
+    /*
+      ZMobiusFermion(GaugeField            &_Umu,
+                     GridCartesian         &FiveDimGrid,
+                     GridRedBlackCartesian &FiveDimRedBlackGrid,
+                     GridCartesian         &FourDimGrid,
+                     GridRedBlackCartesian &FourDimRedBlackGrid,
+                     RealD _mass,RealD _M5,
+                     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) :
+    */
+
 }

 // execution ///////////////////////////////////////////////////////////////////

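The rewritten setup copies the XML-parsed `std::vector<std::complex<double>>` into a `std::vector<ComplexD>` before calling the constructor whose signature is quoted in the comment: under NVCC, Grid's `ComplexD` is not necessarily `std::complex<double>`, so the implicit conversion the old call relied on is not guaranteed to exist. A generic sketch of the idiom (the helper name `toGridComplex` is illustrative, not from the source):

    #include <complex>
    #include <vector>

    // Element-wise conversion from std::complex<double> to a complex type
    // GridComplex that is assumed constructible from (real, imag) doubles.
    template <typename GridComplex>
    std::vector<GridComplex> toGridComplex(const std::vector<std::complex<double>> &in)
    {
        std::vector<GridComplex> out(in.size());
        for (std::size_t i = 0; i < in.size(); ++i)
        {
            out[i] = GridComplex(in[i].real(), in[i].imag());
        }
        return out;
    }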
@@ -140,8 +140,8 @@ void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &l
     auto lret_v = lret.View();
     auto a_v = a.View();
     auto b_v = b.View();
-    parallel_for(auto site=lret_v.begin();site<lret_v.end();site++) {
-        vTComplex left;
+    thread_loop( (auto site=lret_v.begin();site<lret_v.end();site++) ,{
+        vTComplex left;
         for(int si=0; si < Ns; ++si){
         for(int sj=0; sj < Ns; ++sj){
             for (int ci=0; ci < Nc; ++ci){
@@ -152,7 +152,7 @@ void TFourQuark<FImpl1, FImpl2>::tensorprod(LatticeSpinColourSpinColourMatrix &l
                 lret_v[site]()(si,sj)(ci,cj)=left()*b_v[site]();
             }}
         }}
-    }
+    });
 #endif
 }

@@ -68,6 +68,7 @@ private:
 class OperatorFunctionWrapper: public OperatorFunction<Field>
 {
 public:
+    using OperatorFunction<Field>::operator();
     OperatorFunctionWrapper(LinearFunction<Field> &fn): fn_(fn) {};
     virtual ~OperatorFunctionWrapper(void) = default;
     virtual void operator()(LinearOperatorBase<Field> &op,
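The added `using OperatorFunction<Field>::operator();` re-exposes the base-class call operators: a derived class that declares any `operator()` hides all base-class overloads of that name, which NVCC reports as a hidden-overload warning. A standalone illustration of the rule (sketch, not Grid code):

    // Minimal illustration of the name-hiding issue the using-declaration fixes.
    struct Base
    {
        virtual void operator()(int) {}
        virtual void operator()(int, int) {}
        virtual ~Base() = default;
    };

    struct Derived : Base
    {
        using Base::operator();          // without this, operator()(int,int) is hidden
        void operator()(int) override {}
    };

    int main()
    {
        Derived d;
        d(1);        // Derived::operator()(int)
        d(1, 2);     // Base::operator()(int,int): visible only via the using-declaration
    }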
@@ -251,18 +251,18 @@ int main(int argc, char* argv[])

     // parse parameter file
     ContractorPar par;
-    unsigned int nMat, nCont;
+    // unsigned int nMat,nCont;
     XmlReader reader(parFilename);

     read(reader, "global",    par.global);
     read(reader, "a2aMatrix", par.a2aMatrix);
     read(reader, "product",   par.product);
-    nMat  = par.a2aMatrix.size();
-    nCont = par.product.size();
+    // nMat  = par.a2aMatrix.size();
+    // nCont = par.product.size();

     // create diskvectors
     std::map<std::string, EigenDiskVector<ComplexD>> a2aMat;
-    unsigned int cacheSize;
+    // unsigned int cacheSize;

     for (auto &p: par.a2aMatrix)
     {
@@ -281,7 +281,8 @@ int main(int argc, char* argv[])
     for (auto &p: par.a2aMatrix)
     {
         std::string filename = p.file;
-        double t, size;
+        double t;
+        // double size;

         tokenReplace(filename, "traj", traj);
         std::cout << "======== Loading '" << filename << "'" << std::endl;
@@ -305,7 +306,8 @@ int main(int argc, char* argv[])
     std::vector<A2AMatrixTr<ComplexD>> lastTerm(par.global.nt);
     A2AMatrix<ComplexD> prod, buf, tmp;
     TimerArray tAr;
-    double fusec, busec, flops, bytes, tusec;
+    double fusec, busec, flops, bytes;
+    // double tusec;
     Contractor::CorrelatorResult result;

     tAr.startTimer("Total");
@@ -351,11 +353,11 @@ int main(int argc, char* argv[])

     tAr.startTimer("Transpose caching");
     lastTerm[t].resize(ref.rows(), ref.cols());
-    parallel_for (unsigned int j = 0; j < ref.cols(); ++j)
+    thread_loop( (unsigned int j = 0; j < ref.cols(); ++j),
     for (unsigned int i = 0; i < ref.rows(); ++i)
     {
         lastTerm[t](i, j) = ref(i, j);
-    }
+    });
     tAr.stopTimer("Transpose caching");
 }
 bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);

@@ -11,12 +11,14 @@ using namespace Hadrons;
 #ifdef GRID_COMMS_MPI3
 #define GET_RANK(rank, nMpi) \
     MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
-    MPI_Comm_rank(MPI_COMM_WORLD, &(rank))
+    MPI_Comm_rank(MPI_COMM_WORLD, &(rank));\
+    assert(rank<nMpi)

 #define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
 #define INIT() MPI_Init(NULL, NULL)
 #define FINALIZE() MPI_Finalize()
 #else
-#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0
+#define GET_RANK(rank, nMpi) (nMpi) = 1; (rank) = 0 ; assert(rank<nMpi)
 #define BARRIER()
 #define INIT()
 #define FINALIZE()
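Both branches of `GET_RANK` now end in `assert(rank<nMpi)`, so an inconsistent rank/size pair fails immediately; note the MPI branch also gains the `;\` continuation that splices the assert into the macro. A sketch of what a call site expands to in an MPI build (assuming `GRID_COMMS_MPI3` is defined):

    #include <cassert>
    #include <mpi.h>

    int main(int argc, char *argv[])
    {
        int nMpi, rank;

        MPI_Init(NULL, NULL);                  // INIT();
        MPI_Comm_size(MPI_COMM_WORLD, &nMpi);  // GET_RANK(rank, nMpi);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);  //   ...expands to these
        assert(rank < nMpi);                   //   three statements
        MPI_Barrier(MPI_COMM_WORLD);           // BARRIER();
        MPI_Finalize();                        // FINALIZE();
        return 0;
    }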
@@ -47,7 +49,7 @@ inline void trBenchmark(const std::string name, const MatLeft &left,
     if (rank == 0)
     {
         std::cout << std::setw(34) << name << ": diff= "
-                  << std::setw(12) << std::norm(buf-ref)
+                  << std::setw(12) << abs(buf-ref)
                   << std::setw(10) << t/1.0e6 << " sec "
                   << std::setw(10) << flops/t/1.0e3 << " GFlop/s "
                   << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s "
@@ -61,7 +63,8 @@ inline void mulBenchmark(const std::string name, const MatV &left,
                          const MatV &right, const Mat &ref, Function fn)
 {
     double       t, flops, bytes;
-    double       nr = left[0].rows(), nc = left[0].cols(), n = nr*nc;
+    double       nr = left[0].rows(), nc = left[0].cols();
+    // double    n = nr*nc;
     unsigned int nMat = left.size();
     int          nMpi, rank;
     Mat          buf(left[0].rows(), left[0].rows());
@@ -202,7 +205,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
         auto nr = a.rows(), nc = a.cols();

         res = 0.;
-        parallel_for (unsigned int i = 0; i < nr; ++i)
+        thread_loop( (unsigned int i = 0; i < nr; ++i),
         {
             ComplexD tmp = 0.;

@@ -210,11 +213,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Naive loop cols first", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -222,7 +225,7 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
         auto nr = a.rows(), nc = a.cols();

         res = 0.;
-        parallel_for (unsigned int j = 0; j < nc; ++j)
+        thread_loop( (unsigned int j = 0; j < nc; ++j),
         {
             ComplexD tmp = 0.;

@@ -230,11 +233,11 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
             {
                 tmp += a(i, j)*b(j, i);
             }
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen tr(A*B)", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -245,31 +248,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_loop( (unsigned int r = 0; r < a.rows(); ++r),
         {
             ComplexD tmp;

             tmp = a.row(r).conjugate().dot(b.col(r));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen col-wise dot", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_loop( (unsigned int c = 0; c < a.cols(); ++c),
         {
             ComplexD tmp;

             tmp = a.col(c).conjugate().dot(b.row(c));
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("Eigen Hadamard", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
@@ -281,31 +284,31 @@ void fullTrBenchmark(const unsigned int ni, const unsigned int nj, const unsigne
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int r = 0; r < a.rows(); ++r)
+        thread_loop( (unsigned int r = 0; r < a.rows(); ++r),
         {
             ComplexD tmp;

             zdotuRow(tmp, r, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
     trBenchmark("MKL col-wise zdotu", left, right, ref,
     [](ComplexD &res, const MatLeft &a, const MatRight &b)
     {
         res = 0.;
-        parallel_for (unsigned int c = 0; c < a.cols(); ++c)
+        thread_loop( (unsigned int c = 0; c < a.cols(); ++c),
         {
             ComplexD tmp;

             zdotuCol(tmp, c, a, b);
-            parallel_critical
+            thread_critical
             {
                 res += tmp;
             }
-        }
+        });
     });
 #endif
     BARRIER();
@@ -403,11 +406,7 @@ int main(int argc, char *argv[])

     std::cout << nMpi << " MPI processes" << std::endl;
 #ifdef GRID_OMP
-#pragma omp parallel
-    {
-#pragma omp single
-        std::cout << omp_get_num_threads() << " threads\n" << std::endl;
-    }
+    std::cout << omp_get_num_threads() << " threads\n" << std::endl;
 #else
     std::cout << "Single-threaded\n" << std::endl;
 #endif
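One behavioural caveat with the simplified line: `omp_get_num_threads()` returns 1 when called outside a parallel region, which is exactly why the removed code wrapped the call in `#pragma omp parallel`/`#pragma omp single`. A sketch that keeps a single unguarded call but still reports the pool size would use `omp_get_max_threads()` instead (an alternative, not what this commit does):

    #include <iostream>
    #include <omp.h>

    // Reports the number of threads the runtime would use for the next
    // parallel region, without having to open one.
    void printThreadCount(void)
    {
        std::cout << omp_get_max_threads() << " threads\n" << std::endl;
    }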