diff --git a/Hadrons/Application.hpp b/Hadrons/Application.hpp
index 3578c919..d336cc5c 100644
--- a/Hadrons/Application.hpp
+++ b/Hadrons/Application.hpp
@@ -41,14 +41,6 @@ BEGIN_HADRONS_NAMESPACE
 class Application
 {
 public:
-    class TrajRange: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
-                                        unsigned int, start,
-                                        unsigned int, end,
-                                        unsigned int, step);
-    };
     class GlobalPar: Serializable
     {
     public:
diff --git a/Hadrons/Global.hpp b/Hadrons/Global.hpp
index aaf03427..ff1db409 100644
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -263,6 +263,36 @@ void tokenReplace(std::string &str, const std::string token,
     }
 }
 
+// trajectory range
+class TrajRange: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
+                                    unsigned int, start,
+                                    unsigned int, end,
+                                    unsigned int, step,
+                                    std::string, exclude);
+
+    // Trajectories start, start + step, ... strictly below end, skipping
+    // every value listed in 'exclude'. 'step' must be non-zero, otherwise
+    // the loop below never advances.
+    inline std::vector<unsigned int> getTrajectoryList(void)
+    {
+        std::vector<unsigned int> excVec = strToVec<unsigned int>(exclude);
+        std::vector<unsigned int> list;
+
+        for (unsigned int t = start; t < end; t += step)
+        {
+            if (std::find(excVec.begin(), excVec.end(), t) == excVec.end())
+            {
+                list.push_back(t);
+            }
+        }
+
+        return list;
+    }
+};
+
 END_HADRONS_NAMESPACE
 
 #include
diff --git a/Hadrons/Makefile.am b/Hadrons/Makefile.am
index b748085a..1e8657bb 100644
--- a/Hadrons/Makefile.am
+++ b/Hadrons/Makefile.am
@@ -34,4 +34,5 @@ nobase_libHadrons_a_HEADERS = \
     Solver.hpp \
     TimerArray.hpp \
     VirtualMachine.hpp \
+    Utilities/Contractor.hpp \
     $(modules_hpp)
diff --git a/Hadrons/Utilities/Contractor.cc b/Hadrons/Utilities/Contractor.cc
index 11ea0bc1..84ebefe7 100644
--- a/Hadrons/Utilities/Contractor.cc
+++ b/Hadrons/Utilities/Contractor.cc
@@ -28,6 +28,27 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include
 #include
 #include
+#include
+
+// MPI helpers: real collectives when GRID_COMMS_MPI3 is defined, otherwise
+// single-rank stand-ins (rank 0 of 1, everything else expands to nothing).
+#ifdef GRID_COMMS_MPI3
+#define GET_RANK(rank, nMpi) do {\
+MPI_Comm_size(MPI_COMM_WORLD, &(nMpi));\
+MPI_Comm_rank(MPI_COMM_WORLD, &(rank)); } while (0)
+#define BARRIER() MPI_Barrier(MPI_COMM_WORLD)
+#define GLOBAL_DSUM(x) MPI_Allreduce(MPI_IN_PLACE, &(x), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD)
+#define GLOBAL_DMAX(x) MPI_Allreduce(MPI_IN_PLACE, &(x), 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD)
+#define INIT() MPI_Init(NULL, NULL)
+#define FINALIZE() MPI_Finalize()
+#else
+#define GET_RANK(rank, nMpi) do { (nMpi) = 1; (rank) = 0; } while (0)
+#define BARRIER()
+#define GLOBAL_DSUM(x)
+#define GLOBAL_DMAX(x)
+#define INIT()
+#define FINALIZE()
+#endif
 
 using namespace Grid;
 using namespace QCD;
@@ -35,58 +56,6 @@ using namespace Hadrons;
 
 #define TIME_MOD(t) (((t) + par.global.nt) % par.global.nt)
 
-namespace Contractor
-{
-    class TrajRange: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(TrajRange,
-                                        unsigned int, start,
-                                        unsigned int, end,
-                                        unsigned int, step);
-    };
-
-    class GlobalPar: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
-                                        TrajRange, trajCounter,
-                                        unsigned int, nt,
-                                        std::string, diskVectorDir,
-                                        std::string, output);
-    };
-
-    class A2AMatrixPar: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
-                                        std::string, file,
-                                        std::string, dataset,
-                                        unsigned int, cacheSize,
-                                        std::string, name);
-    };
-
-    class ProductPar: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
-                                        std::string, terms,
-                                        std::vector, times,
-                                        std::string, translations,
-                                        bool, translationAverage);
-    };
-
-    class CorrelatorResult: Serializable
-    {
-    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
-                                        std::vector, a2aMatrix,
-                                        ProductPar, contraction,
-                                        std::vector, times,
-                                        std::vector, correlator);
-    };
-}
-
 struct ContractorPar
 {
     Contractor::GlobalPar global;
@@ -143,6 +112,31 @@ void saveCorrelator(const Contractor::CorrelatorResult &result, const std::strin
     write(writer, fileStem, result);
 }
 
+// Print the largest 'usec' across ranks, converted to seconds, and the
+// matching bandwidth in GB/s; 'usec' is a duration in microseconds.
+void printPerf(const double bytes, const double usec)
+{
+    double maxt;
+
+    maxt = usec;
+    GLOBAL_DMAX(maxt);
+    std::cout << maxt/1.0e6 << " sec " << bytes/maxt*1.0e6/1024/1024/1024 << " GB/s";
+}
+
+// Same as above for (bytes, busec), followed by the GFlop/s rate computed
+// from 'flops' and the largest 'fusec' (microseconds) across ranks.
+void printPerf(const double bytes, const double busec,
+               const double flops, const double fusec)
+{
+    double maxt;
+
+    printPerf(bytes, busec);
+    std::cout << " ";
+    maxt = fusec;
+    GLOBAL_DMAX(maxt);
+    std::cout << flops/maxt/1.0e3 << " GFlop/s";
+}
+
 std::set parseTimeRange(const std::string str, const unsigned int nt)
 {
     std::regex rex("([0-9]+)|(([0-9]+)\\.\\.([0-9]+))");
@@ -184,59 +178,18 @@ std::set parseTimeRange(const std::string str, const unsigned int
     return tSet;
 }
 
-struct Sec
-{
-    Sec(const double usec)
-    {
-        seconds = usec/1.0e6;
-    }
-
-    double seconds;
-};
-
-inline std::ostream & operator<< (std::ostream& s, const Sec &&sec)
-{
-    s << std::setw(10) << sec.seconds << " sec";
-
-    return s;
-}
-
-struct Flops
-{
-    Flops(const double flops, const double fusec)
-    {
-        gFlopsPerSec = flops/fusec/1.0e3;
-    }
-
-    double gFlopsPerSec;
-};
-
-inline std::ostream & operator<< (std::ostream& s, const Flops &&f)
-{
-    s << std::setw(10) << f.gFlopsPerSec << " GFlop/s";
-
-    return s;
-}
-
-struct Bytes
-{
-    Bytes(const double bytes, const double busec)
-    {
-        gBytesPerSec = bytes/busec*1.0e6/1024/1024/1024;
-    }
-
-    double gBytesPerSec;
-};
-
-inline std::ostream & operator<< (std::ostream& s, const Bytes &&b)
-{
-    s << std::setw(10) << b.gBytesPerSec << " GB/s";
-
-    return s;
-}
-
 int main(int argc, char* argv[])
 {
+    // MPI init
+    int nMpi, rank;
+
+    INIT();
+    GET_RANK(rank, nMpi);
+    if (rank != 0)
+    {
+        std::cout.setstate(std::ios::badbit);
+    }
+
     // parse command line
     std::string parFilename;
 
@@ -272,25 +225,63 @@ int main(int argc, char* argv[])
     }
 
     // trajectory loop
-    for (unsigned int traj = par.global.trajCounter.start;
-         traj < par.global.trajCounter.end; traj += par.global.trajCounter.step)
-    {
-        std::cout << ":::::::: Trajectory " << traj << std::endl;
+    std::vector<unsigned int> tList = par.global.trajCounter.getTrajectoryList();
+    unsigned int indi, indPerRank;
 
+    indPerRank = tList.size()/nMpi; // NOTE(review): floor division, the last tList.size() % nMpi trajectories are never visited
+    indi = rank*indPerRank;
+
+    BARRIER();
+    for (unsigned int tInd = indi; tInd < indi + indPerRank; tInd++)
+    {
+        unsigned int traj;
+
+        if (tInd < tList.size())
+        {
+            traj = tList[tInd];
+        }
+        else
+        {
+            traj = tList.back();
+        }
+        if (nMpi > 1)
+        {
+            if (rank == 0)
+            {
+                std::cout << ":::::::: Trajectories ";
+                for (unsigned int r = 0; r < nMpi - 1; ++r)
+                {
+                    std::cout << tList[tInd + r*indPerRank] << " ";
+                }
+                if (tInd + (nMpi - 1)*indPerRank < tList.size())
+                {
+                    std::cout << tList[tInd + (nMpi - 1)*indPerRank];
+                }
+                std::cout << std::endl;
+            }
+        }
+        else
+        {
+            std::cout << ":::::::: Trajectory " << traj << std::endl;
+        }
+
         // load data
         for (auto &p: par.a2aMatrix)
         {
             std::string filename = p.file;
-            double t, size;
+            double t;
 
             tokenReplace(filename, "traj", traj);
             std::cout << "======== Loading '" << filename << "'" << std::endl;
 
+            BARRIER();
             A2AMatrixIo a2aIo(filename, p.dataset, par.global.nt);
 
             a2aIo.load(a2aMat.at(p.name), &t);
-            std::cout << "Read " << a2aIo.getSize() << " bytes in " << t/1.0e6
-                      << " sec, " << a2aIo.getSize()/t*1.0e6/1024/1024 << " MB/s" << std::endl;
+            GLOBAL_DMAX(t);
+            std::cout << "Read " << nMpi*a2aIo.getSize() << " bytes in " << t/1.0e6
+                      << " sec, " << nMpi*a2aIo.getSize()/t*1.0e6/1024/1024
+                      << " MB/s" << std::endl;
         }
 
         // contract
@@ -308,6 +299,7 @@ int main(int argc, char* argv[])
             double fusec, busec, flops, bytes, tusec;
             Contractor::CorrelatorResult result;
 
+            BARRIER();
             tAr.startTimer("Total");
             std::cout << "======== Contraction tr(";
             for (unsigned int g = 0; g < term.size(); ++g)
@@ -358,9 +350,10 @@ int main(int argc, char* argv[])
                 }
                 tAr.stopTimer("Transpose caching");
             }
-            bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols()*sizeof(ComplexD);
-            std::cout << Sec(tAr.getDTimer("Transpose caching")) << " "
-                      << Bytes(bytes, tAr.getDTimer("Transpose caching")) << std::endl;
+            bytes = par.global.nt*lastTerm[0].rows()*lastTerm[0].cols();
+            bytes *= sizeof(ComplexD)*nMpi;
+            printPerf(bytes, tAr.getDTimer("Transpose caching"));
+            std::cout << std::endl;
             for (unsigned int i = 0; i < timeSeq.size(); ++i)
             {
                 unsigned int dti = 0;
@@ -405,9 +398,9 @@ int main(int argc, char* argv[])
                     }
                     if (term.size() > 2)
                     {
-                        std::cout << Sec(tAr.getDTimer("A*B total") - busec) << " "
-                                  << Flops(flops, tAr.getDTimer("A*B algebra") - fusec) << " "
-                                  << Bytes(bytes, tAr.getDTimer("A*B total") - busec) << std::endl;
+                        printPerf(bytes*nMpi, tAr.getDTimer("A*B total") - busec,
+                                  flops*nMpi, tAr.getDTimer("A*B algebra") - fusec);
+                        std::cout << std::endl;
                     }
                     std::cout << std::setw(8) << "traces";
                     flops = 0.;
@@ -423,9 +416,9 @@ int main(int argc, char* argv[])
                         bytes += 2.*prod.rows()*prod.cols()*sizeof(ComplexD);
                     }
                     tAr.stopTimer("Linear algebra");
-                    std::cout << Sec(tAr.getDTimer("tr(A*B)") - busec) << " "
-                              << Flops(flops, tAr.getDTimer("tr(A*B)") - fusec) << " "
-                              << Bytes(bytes, tAr.getDTimer("tr(A*B)") - busec) << std::endl;
+                    printPerf(bytes*nMpi, tAr.getDTimer("tr(A*B)") - busec,
+                              flops*nMpi, tAr.getDTimer("tr(A*B)") - fusec);
+                    std::cout << std::endl;
                     if (!p.translationAverage)
                     {
                         saveCorrelator(result, par.global.output, dt, traj);
@@ -450,5 +443,7 @@ int main(int argc, char* argv[])
         }
     }
 
+    FINALIZE();
+
     return EXIT_SUCCESS;
 }
diff --git a/Hadrons/Utilities/Contractor.hpp b/Hadrons/Utilities/Contractor.hpp
index 9640c7c8..37b1f35d 100644
--- a/Hadrons/Utilities/Contractor.hpp
+++ b/Hadrons/Utilities/Contractor.hpp
@@ -5,7 +5,48 @@
 
 BEGIN_HADRONS_NAMESPACE
 
+namespace Contractor
+{
+    class GlobalPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(GlobalPar,
+                                        TrajRange, trajCounter,
+                                        unsigned int, nt,
+                                        std::string, diskVectorDir,
+                                        std::string, output);
+    };
 
+    class A2AMatrixPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMatrixPar,
+                                        std::string, file,
+                                        std::string, dataset,
+                                        unsigned int, cacheSize,
+                                        std::string, name);
+    };
+
+    class ProductPar: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(ProductPar,
+                                        std::string, terms,
+                                        std::vector, times,
+                                        std::string, translations,
+                                        bool, translationAverage);
+    };
+
+    class CorrelatorResult: Serializable
+    {
+    public:
+        GRID_SERIALIZABLE_CLASS_MEMBERS(CorrelatorResult,
+                                        std::vector, a2aMatrix,
+                                        ProductPar, contraction,
+                                        std::vector, times,
+                                        std::vector, correlator);
+    };
+}
 
 END_HADRONS_NAMESPACE
 
diff --git a/Hadrons/Utilities/Makefile.am b/Hadrons/Utilities/Makefile.am
index 4f324d6d..33f0e8e9 100644
--- a/Hadrons/Utilities/Makefile.am
+++ b/Hadrons/Utilities/Makefile.am
@@ -7,7 +7,7 @@ HadronsFermionEP64To32_SOURCES = EigenPackCast.cc
 HadronsFermionEP64To32_CXXFLAGS = $(AM_CXXFLAGS) -DFIN=WilsonImplD::FermionField -DFOUT=WilsonImplF::FermionField
 HadronsFermionEP64To32_LDADD = ../libHadrons.a ../../Grid/libGrid.a
 
-HadronsContractor_SOURCES = Contractor.cc
+HadronsContractor_SOURCES = Contractor.cc Contractor.hpp
 HadronsContractor_LDADD = ../libHadrons.a ../../Grid/libGrid.a
 
 HadronsContractorBenchmark_SOURCES = ContractorBenchmark.cc