From 58567fc650fe36c27b38046e6b3afa394679df76 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 4 Oct 2018 20:01:49 +0100 Subject: [PATCH] Hadrons: big update abstracting the block meson field routine, tested & working, performance counters broken and code dirty --- Hadrons/A2AMatrix.hpp | 307 ++++++++++++++- Hadrons/Makefile.am | 2 + Hadrons/Module.cc | 95 ----- Hadrons/Module.hpp | 13 +- Hadrons/Modules.hpp | 2 +- ...2AMesonFieldKernels.hpp => A2AKernels.hpp} | 207 +++++++++- .../Modules/MContraction/A2AMesonField.hpp | 371 +++++++++++------- Hadrons/TimerArray.cc | 99 +++++ Hadrons/TimerArray.hpp | 29 ++ Hadrons/modules.inc | 2 +- 10 files changed, 844 insertions(+), 283 deletions(-) rename Hadrons/Modules/MContraction/{A2AMesonFieldKernels.hpp => A2AKernels.hpp} (52%) create mode 100644 Hadrons/TimerArray.cc create mode 100644 Hadrons/TimerArray.hpp diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp index 22eeddd1..e58ae71b 100644 --- a/Hadrons/A2AMatrix.hpp +++ b/Hadrons/A2AMatrix.hpp @@ -7,6 +7,7 @@ Source file: Hadrons/A2AMatrix.hpp Copyright (C) 2015-2018 Author: Antonin Portelli +Author: Peter Boyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -29,10 +30,28 @@ See the full license in the file "LICENSE" in the top level distribution directo #define A2A_Matrix_hpp_ #include +#include +#include + +#ifndef HADRONS_A2AM_NAME +#define HADRONS_A2AM_NAME "a2aMatrix" +#endif + +#define HADRONS_A2AM_PARALLEL_IO BEGIN_HADRONS_NAMESPACE -template +// general A2A matrix set class based on Eigen tensors and Grid-allocated memory +// Dimensions: +// 0 - ext - external field (momentum, EM field, ...) +// 1 - str - spin-color structure +// 2 - t - timeslice +// 3 - i - left A2A mode index +// 4 - j - right A2A mode index +template +using A2AMatrixSet = Eigen::TensorMap>; + +template class A2AMatrixIo { public: @@ -41,26 +60,265 @@ public: const unsigned int nt, const unsigned int ni, const unsigned int nj); ~A2AMatrixIo(void) = default; + template void initFile(const MetadataType &d, const unsigned int chunkSize); void saveBlock(const T *data, const unsigned int i, const unsigned int j, const unsigned int blockSizei, const unsigned int blockSizej); + void saveBlock(const A2AMatrixSet &m, const unsigned int ext, const unsigned int str, + const unsigned int i, const unsigned int j); private: std::string filename_, dataname_; unsigned int nt_, ni_, nj_; }; -template -A2AMatrixIo::A2AMatrixIo(std::string filename, - std::string dataname, - const unsigned int nt, - const unsigned int ni, - const unsigned int nj) +template +class A2AKernel +{ +public: + A2AKernel(void) = default; + virtual ~A2AKernel(void) = default; + virtual void operator()(A2AMatrixSet &m, const Field *left, const Field *right, + const unsigned int orthogDim, double &time) = 0; + virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0; + virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0; +}; + +template +class A2AMatrixBlockComputation +{ +private: + struct IoHelper + { + A2AMatrixIo io; + MetadataType md; + unsigned int e, s, i, j; + }; + typedef std::function FilenameFn; + typedef std::function MetadataFn; +public: + A2AMatrixBlockComputation(GridBase *grid, + const unsigned int orthogDim, + const unsigned int next, + const unsigned int nstr, + const unsigned int blockSize, + const unsigned int cacheBlockSize, + TimerArray *tArray = nullptr); + void 
execute(const std::vector &left, + const std::vector &right, + A2AKernel &kernel, + const FilenameFn &ionameFn, + const FilenameFn &filenameFn, + const MetadataFn &metadataFn); +private: + void saveBlock(const A2AMatrixSet &m, IoHelper &h); +private: + TimerArray *tArray_; + GridBase *grid_; + unsigned int orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_; + Vector mCache_; + Vector mBuf_; + std::vector nodeIo_; +}; + +template +A2AMatrixBlockComputation +::A2AMatrixBlockComputation(GridBase *grid, + const unsigned int orthogDim, + const unsigned int next, + const unsigned int nstr, + const unsigned int blockSize, + const unsigned int cacheBlockSize, + TimerArray *tArray) +: grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim) +, next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize) +, tArray_(tArray) +{ + std::cout << nt_ << std::endl; + std::cout << next_ << std::endl; + std::cout << nstr_ << std::endl; + std::cout << cacheBlockSize_ << std::endl; + std::cout << blockSize_ << std::endl; + mCache_.resize(nt_*next_*nstr_*cacheBlockSize_*cacheBlockSize_); + mBuf_.resize(nt_*next_*nstr_*blockSize_*blockSize_); + std::cout << mCache_.size() << std::endl; + std::cout << mBuf_.size() << std::endl; + std::cout << blockSize << std::endl; + std::cout << cacheBlockSize << std::endl; +} + +#define START_TIMER(name) if (tArray_) tArray_->startTimer(name) +#define STOP_TIMER(name) if (tArray_) tArray_->stopTimer(name) +#define GET_TIMER(name) (tArray_ != nullptr) ? tArray_->getDTimer(name) : 0. + +template +void A2AMatrixBlockComputation +::execute(const std::vector &left, const std::vector &right, + A2AKernel &kernel, const FilenameFn &ionameFn, + const FilenameFn &filenameFn, const MetadataFn &metadataFn) +{ + ////////////////////////////////////////////////////////////////////////// + // i,j is first loop over SchurBlock factors reusing 5D matrices + // ii,jj is second loop over cacheBlock factors for high perf contractoin + // iii,jjj are loops within cacheBlock + // Total index is sum of these i+ii+iii etc... + ////////////////////////////////////////////////////////////////////////// + int N_i = left.size(); + int N_j = right.size(); + double flops; + double bytes; + double t_kernel = 0.; + double nodes = grid_->NodeCount(); + double tot_kernel; + + double t0 = usecond(); + int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0); + int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 
1 : 0); + + for(int i=0;i mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj); + + // Series of cache blocked chunks of the contractions within this block + flops = 0.0; + bytes = 0.0; + for(int ii=0;ii mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj); + + // makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, + // env().getNd() - 1, this); + START_TIMER("kernel"); + kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, tot_kernel); + STOP_TIMER("kernel"); + + // flops for general N_c & N_s + // flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; + // bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj + // + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma; + flops += kernel.flops(N_iii, N_jjj); + bytes += kernel.bytes(N_iii, N_jjj); + + START_TIMER("cache copy"); + parallel_for_nest5(int e =0;eThisRank(), + nRank = grid_->RankCount(); + + LOG(Message) << "Writing block to disk" << std::endl; + ioTime = -GET_TIMER("IO: write block"); + START_TIMER("IO: total"); + makeFileDir(filenameFn(0, 0), grid_); +#ifdef HADRONS_A2AM_PARALLEL_IO + grid_->Barrier(); + // make task list for current node + nodeIo_.clear(); + for(int f = myRank; f < next_*nstr_; f += nRank) + { + IoHelper h; + + h.i = i; + h.j = j; + h.e = f/nstr_; + h.s = f % nstr_; + h.io = A2AMatrixIo(filenameFn(h.e, h.s), + ionameFn(h.e, h.s), nt_, N_i, N_j); + h.md = metadataFn(h.e, h.s); + nodeIo_.push_back(h); + } + // parallel IO + for (auto &h: nodeIo_) + { + saveBlock(mBlock, h); + } + grid_->Barrier(); +#else + // serial IO, for testing purposes only + for(int e = 0; e < next_; e++) + for(int s = 0; s < nstr_; s++) + { + IoHelper h; + + h.i = i; + h.j = j; + h.e = e; + h.s = s; + h.io = A2AMatrixIo(filenameFn(h.e, h.s), + ionameFn(h.e, h.s), nt_, N_i, N_j); + h.md = metadataFn(h.e, h.s); + saveBlock(mfBlock, h); + } +#endif + STOP_TIMER("IO: total"); + blockSize = static_cast(next_*nstr_*nt_*N_ii*N_jj*sizeof(TIo)); + ioTime += GET_TIMER("IO: write block"); + LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in " + << ioTime << " us (" + << blockSize/ioTime*1.0e6/1024/1024 + << " MB/s)" << std::endl; + } +} + +template +void A2AMatrixBlockComputation +::saveBlock(const A2AMatrixSet &m, IoHelper &h) +{ + if ((h.i == 0) and (h.j == 0)) + { + START_TIMER("IO: file creation"); + h.io.initFile(h.md, blockSize_); + STOP_TIMER("IO: file creation"); + } + START_TIMER("IO: write block"); + h.io.saveBlock(m, h.e, h.s, h.i, h.j); + STOP_TIMER("IO: write block"); +} + +template +A2AMatrixIo::A2AMatrixIo(std::string filename, std::string dataname, + const unsigned int nt, const unsigned int ni, + const unsigned int nj) : filename_(filename), dataname_(dataname) , nt_(nt), ni_(ni), nj_(nj) {} -template -void A2AMatrixIo::initFile(const MetadataType &d, const unsigned int chunkSize) +template +template +void A2AMatrixIo::initFile(const MetadataType &d, const unsigned int chunkSize) { #ifdef HAVE_HDF5 std::vector dim = {static_cast(nt_), @@ -85,18 +343,18 @@ void A2AMatrixIo::initFile(const MetadataType &d, const unsigne push(reader, dataname_); auto &group = reader.getGroup(); plist.setChunk(chunk.size(), chunk.data()); - dataset = group.createDataSet("data", Hdf5Type::type(), dataspace, plist); + dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type::type(), dataspace, plist); #else HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library"); #endif } -template -void A2AMatrixIo::saveBlock(const T *data, - const unsigned int i, - const unsigned int j, - 
const unsigned int blockSizei, - const unsigned int blockSizej) +template +void A2AMatrixIo::saveBlock(const T *data, + const unsigned int i, + const unsigned int j, + const unsigned int blockSizei, + const unsigned int blockSizej) { #ifdef HAVE_HDF5 Hdf5Reader reader(filename_); @@ -111,7 +369,7 @@ void A2AMatrixIo::saveBlock(const T *data, push(reader, dataname_); auto &group = reader.getGroup(); - dataset = group.openDataSet("data"); + dataset = group.openDataSet(HADRONS_A2AM_NAME); dataspace = dataset.getSpace(); dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(), stride.data(), block.data()); @@ -121,6 +379,21 @@ void A2AMatrixIo::saveBlock(const T *data, #endif } +template +void A2AMatrixIo::saveBlock(const A2AMatrixSet &m, + const unsigned int ext, const unsigned int str, + const unsigned int i, const unsigned int j) +{ + unsigned int blockSizei = m.dimension(3); + unsigned int blockSizej = m.dimension(4); + unsigned int nstr = m.dimension(1); + size_t offset = (ext*nstr + str)*nt_*blockSizei*blockSizej; + + saveBlock(m.data() + offset, i, j, blockSizei, blockSizej); +} + + + END_HADRONS_NAMESPACE #endif // A2A_Matrix_hpp_ diff --git a/Hadrons/Makefile.am b/Hadrons/Makefile.am index 4ec45c03..206011f4 100644 --- a/Hadrons/Makefile.am +++ b/Hadrons/Makefile.am @@ -11,6 +11,7 @@ libHadrons_a_SOURCES = \ Exceptions.cc \ Global.cc \ Module.cc \ + TimerArray.cc \ VirtualMachine.cc libHadrons_adir = $(includedir)/Hadrons nobase_libHadrons_a_HEADERS = \ @@ -31,4 +32,5 @@ nobase_libHadrons_a_HEADERS = \ Modules.hpp \ ModuleFactory.hpp \ Solver.hpp \ + TimerArray.hpp \ VirtualMachine.hpp diff --git a/Hadrons/Module.cc b/Hadrons/Module.cc index 3a610ae6..3d4e834a 100644 --- a/Hadrons/Module.cc +++ b/Hadrons/Module.cc @@ -66,101 +66,6 @@ void ModuleBase::operator()(void) stopAllTimers(); } -// timers ////////////////////////////////////////////////////////////////////// -void ModuleBase::startTimer(const std::string &name) -{ - if (!name.empty()) - { - timer_[name].Start(); - } -} - -GridTime ModuleBase::getTimer(const std::string &name) -{ - GridTime t; - - if (!name.empty()) - { - try - { - bool running = timer_.at(name).isRunning(); - - if (running) stopTimer(name); - t = timer_.at(name).Elapsed(); - if (running) startTimer(name); - } - catch (std::out_of_range &) - { - t = GridTime::zero(); - } - } - else - { - t = GridTime::zero(); - } - - return t; -} - -double ModuleBase::getDTimer(const std::string &name) -{ - return static_cast(getTimer(name).count()); -} - -void ModuleBase::startCurrentTimer(const std::string &name) -{ - if (!name.empty()) - { - stopCurrentTimer(); - startTimer(name); - currentTimer_ = name; - } -} - -void ModuleBase::stopTimer(const std::string &name) -{ - if (timer_.at(name).isRunning()) - { - timer_.at(name).Stop(); - } -} - -void ModuleBase::stopCurrentTimer(void) -{ - if (!currentTimer_.empty()) - { - stopTimer(currentTimer_); - currentTimer_ = ""; - } -} - -void ModuleBase::stopAllTimers(void) -{ - for (auto &t: timer_) - { - stopTimer(t.first); - } - currentTimer_ = ""; -} - -void ModuleBase::resetTimers(void) -{ - timer_.clear(); - currentTimer_ = ""; -} - -std::map ModuleBase::getTimings(void) -{ - std::map timing; - - for (auto &t: timer_) - { - timing[t.first] = t.second.Elapsed(); - } - - return timing; -} - std::string ModuleBase::makeSeedString(void) { std::string seed; diff --git a/Hadrons/Module.hpp b/Hadrons/Module.hpp index 083040ce..752b821e 100644 --- a/Hadrons/Module.hpp +++ b/Hadrons/Module.hpp @@ -30,6 +30,7 @@ See the full 
license in the file "LICENSE" in the top level distribution directo #define Hadrons_Module_hpp_ #include +#include #include BEGIN_HADRONS_NAMESPACE @@ -152,7 +153,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\ * Module class * ******************************************************************************/ // base class -class ModuleBase +class ModuleBase: public TimerArray { public: // constructor @@ -180,16 +181,6 @@ public: virtual void execute(void) = 0; // execution void operator()(void); - // timers - void startTimer(const std::string &name); - GridTime getTimer(const std::string &name); - double getDTimer(const std::string &name); - void startCurrentTimer(const std::string &name); - void stopTimer(const std::string &name); - void stopCurrentTimer(void); - void stopAllTimers(void); - void resetTimers(void); - std::map getTimings(void); protected: // environment shortcut DEFINE_ENV_ALIAS; diff --git a/Hadrons/Modules.hpp b/Hadrons/Modules.hpp index 5dcb2483..634d588d 100644 --- a/Hadrons/Modules.hpp +++ b/Hadrons/Modules.hpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp b/Hadrons/Modules/MContraction/A2AKernels.hpp similarity index 52% rename from Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp rename to Hadrons/Modules/MContraction/A2AKernels.hpp index 76317ea0..44959142 100644 --- a/Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp +++ b/Hadrons/Modules/MContraction/A2AKernels.hpp @@ -31,7 +31,6 @@ See the full license in the file "LICENSE" in the top level distribution directo #include #include -#include BEGIN_HADRONS_NAMESPACE @@ -45,10 +44,10 @@ template void makeMesonFieldBlock(MesonField &mat, const Field *lhs_wi, const Field *rhs_vj, - std::vector gamma, + const std::vector &gamma, const std::vector &mom, int orthogdim, - ModuleBase *caller = nullptr) + double &time) { typedef typename Field::vector_object vobj; typedef typename vobj::scalar_object sobj; @@ -58,6 +57,8 @@ void makeMesonFieldBlock(MesonField &mat, typedef iSpinMatrix SpinMatrix_v; typedef iSpinMatrix SpinMatrix_s; + TimerArray tArray; + int Lblock = mat.dimension(3); int Rblock = mat.dimension(4); @@ -96,7 +97,7 @@ void makeMesonFieldBlock(MesonField &mat, int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - if (caller) caller->startTimer("contraction: colour trace & mom."); + tArray.startTimer("contraction: colour trace & mom."); // Nested parallelism would be ok // Wasting cores here. Test case r parallel_for(int r=0;rstopTimer("contraction: colour trace & mom."); + tArray.stopTimer("contraction: colour trace & mom."); // Sum across simd lanes in the plane, breaking out orthog dir. - if (caller) caller->startTimer("contraction: local space sum"); + tArray.startTimer("contraction: local space sum"); parallel_for(int rt=0;rt icoor(Nd); @@ -166,10 +167,12 @@ void makeMesonFieldBlock(MesonField &mat, } } } - if (caller) caller->stopTimer("contraction: local space sum"); + tArray.stopTimer("contraction: local space sum"); + time = tArray.getDTimer("contraction: colour trace & mom.") + + tArray.getDTimer("contraction: local space sum"); // ld loop and local only?? 
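// Hedged sketch, not from the patch: the kernel now times itself through a
// local TimerArray and returns the total via the `time` out-parameter (in us),
// so it no longer needs a ModuleBase caller. Caller-side use (variable names
// borrowed from A2AMatrixBlockComputation::execute above, not declared here):
//
//     double time = 0.;
//     kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, time);
//     double gflopsPerNode = kernel.flops(N_iii, N_jjj)/time/1.0e3/nodes;
//
// i.e. flop counts per cache block divided by the kernel's own timing,
// reproducing the old "Kernel perf ... Gflop/s/node" log of the module.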
- if (caller) caller->startTimer("contraction: spin trace"); + tArray.startTimer("contraction: spin trace"); int pd = grid->_processors[orthogdim]; int pc = grid->_processor_coor[orthogdim]; parallel_for_nest2(int lt=0;ltstopTimer("contraction: spin trace"); + tArray.stopTimer("contraction: spin trace"); //////////////////////////////////////////////////////////////////// // This global sum is taking as much as 50% of time on 16 nodes // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume // Healthy size that should suffice //////////////////////////////////////////////////////////////////// - if (caller) caller->startTimer("contraction: global sum"); + tArray.startTimer("contraction: global sum"); grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock); + tArray.stopTimer("contraction: global sum"); +} + +template +void makeAslashFieldBlock(AslashField &mat, + const Field *lhs_wi, + const Field *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, + ModuleBase *caller = nullptr) +{ + typedef typename Field::vector_object vobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + + int Lblock = mat.dimension(2); + int Rblock = mat.dimension(3); + + GridBase *grid = lhs_wi[0]._grid; + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nem = emB0.size(); + assert(emB1.size() == Nem); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nem; + int MFlvol = ld*Lblock*Rblock*Nem; + + Vector lvSum(MFrvol); + parallel_for (int r = 0; r < MFrvol; r++) + { + lvSum[r] = zero; + } + + Vector lsSum(MFlvol); + parallel_for (int r = 0; r < MFlvol; r++) + { + lsSum[r] = scalar_type(0.0); + } + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + if (caller) caller->startTimer("contraction: colour trace & Aslash mul"); + // Nested parallelism would be ok + // Wasting cores here. Test case r + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;nstopTimer("contraction: colour trace & Aslash mul"); + + // Sum across simd lanes in the plane, breaking out orthog dir. + if (caller) caller->startTimer("contraction: local space sum"); + parallel_for(int rt=0;rt icoor(Nd); + std::vector extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; + } + } + } + if (caller) caller->stopTimer("contraction: local space sum"); + + // ld loop and local only?? 
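// Hedged aside on the output container: `mat` here is a rank-4 tensor map
// (em field, t, i, j), while the meson-field kernel above fills the rank-5
// A2AMatrixSet (ext, str, t, i, j). Both are Eigen views over flat,
// Grid-allocated memory; a standalone illustration (sizes hypothetical):
//
//     #include <unsupported/Eigen/CXX11/Tensor>
//     Vector<ComplexD> buf(nem*nt*ni*nj);
//     Eigen::TensorMap<Eigen::Tensor<ComplexD, 4, Eigen::RowMajor>>
//         view(buf.data(), nem, nt, ni, nj); // no copy, just an indexed view
//     view(0, 0, 0, 0) = 1.;                 // writes the underlying buffer
//
// Re-mapping the same allocation with new extents is what lets execute()
// reuse a single cache buffer across all cache blocks.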
+ if (caller) caller->startTimer("contraction: tensor store"); + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + parallel_for_nest2(int lt=0;ltstopTimer("contraction: tensor store"); + + if (caller) caller->startTimer("contraction: global sum"); + grid->GlobalSumVector(&mat(0,0,0,0),Nem*Nt*Lblock*Rblock); if (caller) caller->stopTimer("contraction: global sum"); } diff --git a/Hadrons/Modules/MContraction/A2AMesonField.hpp b/Hadrons/Modules/MContraction/A2AMesonField.hpp index 9e07ec6c..4d5f7ddd 100644 --- a/Hadrons/Modules/MContraction/A2AMesonField.hpp +++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp @@ -36,7 +36,7 @@ See the full license in the file "LICENSE" in the top level distribution directo #include #include #include -#include +#include #define MF_PARALLEL_IO #ifndef MF_IO_TYPE @@ -71,21 +71,62 @@ public: Gamma::Algebra, gamma); }; +template +class MesonFieldKernel: public A2AKernel +{ +public: + MesonFieldKernel(const std::vector &gamma, + const std::vector &mom, + GridBase *grid) + : gamma_(gamma), mom_(mom), grid_(grid) + { + vol_ = 1.; + for (auto &d: grid_->GlobalDimensions()) + { + vol_ *= d; + } + } + + virtual ~MesonFieldKernel(void) = default; + virtual void operator()(A2AMatrixSet &m, const Field *left, const Field *right, + const unsigned int orthogDim, double &time) + { + makeMesonFieldBlock(m, left, right, gamma_, mom_, orthogDim, time); + } + + virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) + { + return vol_*(2*8.0+6.0+8.0*mom_.size())*blockSizei*blockSizej*gamma_.size(); + } + + virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) + { + return vol_*(12.0*sizeof(T))*blockSizei*blockSizej + + vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size(); + } +private: + const std::vector &gamma_; + const std::vector &mom_; + GridBase *grid_; + double vol_; +}; + template class TA2AMesonField : public Module { public: FERM_TYPE_ALIASES(FImpl,); SOLVER_TYPE_ALIASES(FImpl,); - typedef Eigen::TensorMap> MesonField; - typedef Eigen::TensorMap> MesonFieldIo; - typedef A2AMatrixIo MatrixIo; + typedef A2AMatrixBlockComputation Computation; + typedef MesonFieldKernel Kernel; struct IoHelper { - MatrixIo io; - A2AMesonFieldMetadata metadata; - size_t offset; - unsigned int i, j, blockSizei, blockSizej; + A2AMatrixIo io; + A2AMesonFieldMetadata md; + unsigned int m, g, i, j; }; public: // constructor @@ -103,13 +144,13 @@ private: // IO std::string ioname(const unsigned int m, const unsigned int g) const; std::string filename(const unsigned int m, const unsigned int g) const; - void saveBlock(const MF_IO_TYPE *data, IoHelper &h); + void saveBlock(const A2AMatrixSet &mf, IoHelper &h); private: - bool hasPhase_{false}; - std::string momphName_; - std::vector gamma_; - std::vector> mom_; - std::vector nodeIo_; + bool hasPhase_{false}; + std::string momphName_; + std::vector gamma_; + std::vector> mom_; + std::vector nodeIo_; }; MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField), MContraction); @@ -190,11 +231,9 @@ void TA2AMesonField::setup(void) envCache(std::vector, momphName_, 1, par().mom.size(), envGetGrid(ComplexField)); envTmpLat(ComplexField, "coor"); - // preallocate memory for meson field block - auto tgp = env().getDim().back()*gamma_.size()*mom_.size(); - - envTmp(Vector, "mfBuf", 1, tgp*par().block*par().block); - envTmp(Vector, "mfCache", 1, tgp*par().cacheBlock*par().cacheBlock); + envTmp(Computation, "computation", 1, envGetGrid(FermionField), + 
env().getNd() - 1, mom_.size(), gamma_.size(), par().block, + par().cacheBlock, this); } // execution /////////////////////////////////////////////////////////////////// @@ -253,7 +292,43 @@ void TA2AMesonField::execute(void) hasPhase_ = true; stopTimer("Momentum phases"); } - + + auto ionameFn = [this](const unsigned int m, const unsigned int g) + { + std::stringstream ss; + + ss << gamma_[g] << "_"; + for (unsigned int mu = 0; mu < mom_[m].size(); ++mu) + { + ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_"); + } + + return ss.str(); + }; + + auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g) + { + return par().output + "." + std::to_string(vm().getTrajectory()) + + "/" + ioname(m, g) + ".h5"; + }; + + auto metadataFn = [this](const unsigned int m, const unsigned int g) + { + A2AMesonFieldMetadata md; + + for (auto pmu: mom_[m]) + { + md.momentum.push_back(pmu); + } + md.gamma = gamma_[g]; + + return md; + }; + + Kernel kernel(gamma_, ph, envGetGrid(FermionField)); + + envGetTmp(Computation, computation); + computation.execute(w, v, kernel, ionameFn, filenameFn, metadataFn); ////////////////////////////////////////////////////////////////////////// // i,j is first loop over SchurBlock factors reusing 5D matrices // ii,jj is second loop over cacheBlock factors for high perf contractoin @@ -261,145 +336,145 @@ void TA2AMesonField::execute(void) // Total index is sum of these i+ii+iii etc... ////////////////////////////////////////////////////////////////////////// - double flops; - double bytes; - double vol = env().getVolume(); - double t_kernel = 0.0; - double nodes = env().getGrid()->NodeCount(); - double tot_kernel; +// double flops; +// double bytes; +// double vol = env().getVolume(); +// double t_kernel = 0.0; +// double nodes = env().getGrid()->NodeCount(); +// double tot_kernel; - envGetTmp(Vector, mfBuf); - envGetTmp(Vector, mfCache); +// envGetTmp(Vector, mfBuf); +// envGetTmp(Vector, mfCache); - double t0 = usecond(); - int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0); - int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0); +// double t0 = usecond(); +// int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0); +// int NBlock_j = N_j/block + (((N_j % block) != 0) ? 
1 : 0); - for(int i=0;i mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj); - // Series of cache blocked chunks of the contractions within this block - flops = 0.0; - bytes = 0.0; - for(int ii=0;ii mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj); - startTimer("contraction: total"); - makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, - env().getNd() - 1, this); - stopTimer("contraction: total"); +// startTimer("contraction: total"); +// makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, +// env().getNd() - 1, this); +// stopTimer("contraction: total"); - // flops for general N_c & N_s - flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; - bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj - + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma; +// // flops for general N_c & N_s +// flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; +// bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj +// + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma; - startTimer("cache copy"); - parallel_for_nest5(int m =0;m< nmom;m++) - for(int g =0;g< ngamma;g++) - for(int t =0;t< nt;t++) - for(int iii=0;iii< N_iii;iii++) - for(int jjj=0;jjj< N_jjj;jjj++) - { - mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj); - } - stopTimer("cache copy"); - } +// startTimer("cache copy"); +// parallel_for_nest5(int m =0;m< nmom;m++) +// for(int g =0;g< ngamma;g++) +// for(int t =0;t< nt;t++) +// for(int iii=0;iii< N_iii;iii++) +// for(int jjj=0;jjj< N_jjj;jjj++) +// { +// mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj); +// } +// stopTimer("cache copy"); +// } - // perf - tot_kernel = getDTimer("contraction: colour trace & mom.") - + getDTimer("contraction: local space sum"); - t_kernel = tot_kernel - t_kernel; - LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes - << " Gflop/s/node " << std::endl; - LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes - << " GB/s/node " << std::endl; - t_kernel = tot_kernel; +// // perf +// tot_kernel = getDTimer("contraction: colour trace & mom.") +// + getDTimer("contraction: local space sum"); +// t_kernel = tot_kernel - t_kernel; +// LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes +// << " Gflop/s/node " << std::endl; +// LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes +// << " GB/s/node " << std::endl; +// t_kernel = tot_kernel; - // IO - if (!par().output.empty()) - { - double blockSize, ioTime; - unsigned int myRank = env().getGrid()->ThisRank(), - nRank = env().getGrid()->RankCount(); +// // IO +// if (!par().output.empty()) +// { +// double blockSize, ioTime; +// unsigned int myRank = env().getGrid()->ThisRank(), +// nRank = env().getGrid()->RankCount(); - LOG(Message) << "Writing block to disk" << std::endl; - ioTime = -getDTimer("IO: write block"); - startTimer("IO: total"); - makeFileDir(filename(0, 0), env().getGrid()); -#ifdef MF_PARALLEL_IO - env().getGrid()->Barrier(); - nodeIo_.clear(); - for(int f = myRank; f < nmom*ngamma; f += nRank) - { - const unsigned int m = f/ngamma, g = f % ngamma; - IoHelper h; +// LOG(Message) << "Writing block to disk" << std::endl; +// ioTime = -getDTimer("IO: write block"); +// startTimer("IO: total"); +// makeFileDir(filename(0, 0), env().getGrid()); +// #ifdef MF_PARALLEL_IO +// env().getGrid()->Barrier(); +// // make task list for current node +// nodeIo_.clear(); +// for(int f = myRank; f < nmom*ngamma; f += nRank) +// { +// IoHelper h; - h.io = 
MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j); - for (auto pmu: mom_[m]) - { - h.metadata.momentum.push_back(pmu); - } - h.metadata.gamma = gamma_[g]; - h.i = i; - h.j = j; - h.blockSizei = mfBlock.dimension(3); - h.blockSizej = mfBlock.dimension(4); - h.offset = (m*ngamma + g)*nt*h.blockSizei*h.blockSizej; - nodeIo_.push_back(h); - } - // parallel IO - for (auto &h: nodeIo_) - { - saveBlock(mfBlock.data(), h); - } - env().getGrid()->Barrier(); -#else - // serial IO - for(int m = 0; m < nmom; m++) - for(int g = 0; g < ngamma; g++) - { - IoHelper h; +// h.i = i; +// h.j = j; +// h.m = f/ngamma; +// h.g = f % ngamma; +// h.io = A2AMatrixIo(filename(h.m, h.g), +// ioname(h.m, h.g), nt, N_i, N_j); +// for (auto pmu: mom_[h.m]) +// { +// h.md.momentum.push_back(pmu); +// } +// h.md.gamma = gamma_[h.g]; +// nodeIo_.push_back(h); +// } +// // parallel IO +// for (auto &h: nodeIo_) +// { +// saveBlock(mfBlock, h); +// } +// env().getGrid()->Barrier(); +// #else +// // serial IO, for testing purposes only +// for(int m = 0; m < nmom; m++) +// for(int g = 0; g < ngamma; g++) +// { +// IoHelper h; - h.io = MatrixIo(filename(m, g), ioname(m, g), nt, N_i, N_j); - for (auto pmu: mom_[m]) - { - h.metadata.momentum.push_back(pmu); - } - h.metadata.gamma = gamma_[g]; - h.i = i; - h.j = j; - h.blockSizei = mfBlock.dimension(3); - h.blockSizej = mfBlock.dimension(4); - h.offset = (m*ngamma + g)*nt*h.blockSizei*h.blockSizej; - saveBlock(mfBlock.data(), h); - } -#endif - stopTimer("IO: total"); - blockSize = static_cast(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE)); - ioTime += getDTimer("IO: write block"); - LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in " - << ioTime << " us (" - << blockSize/ioTime*1.0e6/1024/1024 - << " MB/s)" << std::endl; - } - } +// h.i = i; +// h.j = j; +// h.m = m; +// h.g = g; +// h.io = A2AMatrixIo(filename(h.m, h.g), +// ioname(h.m, h.g), nt, N_i, N_j); +// for (auto pmu: mom_[h.m]) +// { +// h.md.momentum.push_back(pmu); +// } +// h.md.gamma = gamma_[h.g]; +// saveBlock(mfBlock, h); +// } +// #endif +// stopTimer("IO: total"); +// blockSize = static_cast(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE)); +// ioTime += getDTimer("IO: write block"); +// LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in " +// << ioTime << " us (" +// << blockSize/ioTime*1.0e6/1024/1024 +// << " MB/s)" << std::endl; +// } +// } } // IO @@ -425,16 +500,16 @@ std::string TA2AMesonField::filename(unsigned int m, unsigned int g) cons } template -void TA2AMesonField::saveBlock(const MF_IO_TYPE *data, IoHelper &h) +void TA2AMesonField::saveBlock(const A2AMatrixSet &mf, IoHelper &h) { if ((h.i == 0) and (h.j == 0)) { startTimer("IO: file creation"); - h.io.initFile(h.metadata, par().block); + h.io.initFile(h.md, par().block); stopTimer("IO: file creation"); } startTimer("IO: write block"); - h.io.saveBlock(data + h.offset, h.i, h.j, h.blockSizei, h.blockSizej); + h.io.saveBlock(mf, h.m, h.g, h.i, h.j); stopTimer("IO: write block"); } diff --git a/Hadrons/TimerArray.cc b/Hadrons/TimerArray.cc new file mode 100644 index 00000000..c99ed41f --- /dev/null +++ b/Hadrons/TimerArray.cc @@ -0,0 +1,99 @@ +#include + +using namespace Grid; +using namespace QCD; +using namespace Hadrons; + +void TimerArray::startTimer(const std::string &name) +{ + if (!name.empty()) + { + timer_[name].Start(); + } +} + +GridTime TimerArray::getTimer(const std::string &name) +{ + GridTime t; + + if (!name.empty()) + { + try + { + bool running = timer_.at(name).isRunning(); + + if (running) 
stopTimer(name);
+            t = timer_.at(name).Elapsed();
+            if (running) startTimer(name);
+        }
+        catch (std::out_of_range &)
+        {
+            t = GridTime::zero();
+        }
+    }
+    else
+    {
+        t = GridTime::zero();
+    }
+
+    return t;
+}
+
+double TimerArray::getDTimer(const std::string &name)
+{
+    return static_cast<double>(getTimer(name).count());
+}
+
+void TimerArray::startCurrentTimer(const std::string &name)
+{
+    if (!name.empty())
+    {
+        stopCurrentTimer();
+        startTimer(name);
+        currentTimer_ = name;
+    }
+}
+
+void TimerArray::stopTimer(const std::string &name)
+{
+    if (timer_.at(name).isRunning())
+    {
+        timer_.at(name).Stop();
+    }
+}
+
+void TimerArray::stopCurrentTimer(void)
+{
+    if (!currentTimer_.empty())
+    {
+        stopTimer(currentTimer_);
+        currentTimer_ = "";
+    }
+}
+
+void TimerArray::stopAllTimers(void)
+{
+    for (auto &t: timer_)
+    {
+        stopTimer(t.first);
+    }
+    currentTimer_ = "";
+}
+
+void TimerArray::resetTimers(void)
+{
+    timer_.clear();
+    currentTimer_ = "";
+}
+
+std::map<std::string, GridTime> TimerArray::getTimings(void)
+{
+    std::map<std::string, GridTime> timing;
+
+    for (auto &t: timer_)
+    {
+        timing[t.first] = t.second.Elapsed();
+    }
+
+    return timing;
+}
diff --git a/Hadrons/TimerArray.hpp b/Hadrons/TimerArray.hpp
new file mode 100644
index 00000000..477f11cd
--- /dev/null
+++ b/Hadrons/TimerArray.hpp
@@ -0,0 +1,29 @@
+#ifndef Hadrons_TimerArray_hpp_
+#define Hadrons_TimerArray_hpp_
+
+#include <Hadrons/Global.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+class TimerArray
+{
+public:
+    TimerArray(void) = default;
+    virtual ~TimerArray(void) = default;
+    void startTimer(const std::string &name);
+    GridTime getTimer(const std::string &name);
+    double getDTimer(const std::string &name);
+    void startCurrentTimer(const std::string &name);
+    void stopTimer(const std::string &name);
+    void stopCurrentTimer(void);
+    void stopAllTimers(void);
+    void resetTimers(void);
+    std::map<std::string, GridTime> getTimings(void);
+private:
+    std::string                          currentTimer_;
+    std::map<std::string, GridStopWatch> timer_;
+};
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_TimerArray_hpp_
diff --git a/Hadrons/modules.inc b/Hadrons/modules.inc
index 69637a68..3ddc3aa8 100644
--- a/Hadrons/modules.inc
+++ b/Hadrons/modules.inc
@@ -64,7 +64,7 @@ modules_cc =\
 modules_hpp =\
 Modules/MContraction/Baryon.hpp \
 Modules/MContraction/A2AMesonField.hpp \
- Modules/MContraction/A2AMesonFieldKernels.hpp \
+ Modules/MContraction/A2AKernels.hpp \
 Modules/MContraction/Meson.hpp \
 Modules/MContraction/WeakHamiltonian.hpp \
 Modules/MContraction/WeakHamiltonianNonEye.hpp \
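
A usage sketch to close, condensed from TA2AMesonField::execute above and hedged accordingly: the template arguments follow the parameter names of A2AMatrix.hpp (T, Field, MetadataType, TIo); Complex, FermionField, MF_IO_TYPE and the env()/par() helpers are as used in the module, everything else is assumed. Driving the generic block computation amounts to a kernel, three small naming/metadata callbacks, and one call to execute():

    typedef A2AMatrixBlockComputation<Complex, FermionField,
                                      A2AMesonFieldMetadata, MF_IO_TYPE> Computation;
    typedef MesonFieldKernel<Complex, FermionField> Kernel;

    // dataset name, file name and metadata for external index m, structure g
    auto ionameFn = [this](const unsigned int m, const unsigned int g)
    {
        return ioname(m, g);   // e.g. "Gamma5_0_0_0"
    };
    auto filenameFn = [this](const unsigned int m, const unsigned int g)
    {
        return filename(m, g); // "<output>.<traj>/<ioname>.h5"
    };
    auto metadataFn = [this](const unsigned int m, const unsigned int g)
    {
        A2AMesonFieldMetadata md;

        for (auto pmu: mom_[m]) md.momentum.push_back(pmu);
        md.gamma = gamma_[g];

        return md;
    };

    Kernel      kernel(gamma_, ph, envGetGrid(FermionField));
    Computation computation(envGetGrid(FermionField), env().getNd() - 1,
                            mom_.size(), gamma_.size(), par().block,
                            par().cacheBlock, this); // module supplies the timers
    computation.execute(w, v, kernel, ionameFn, filenameFn, metadataFn);

The three callbacks keep all physics naming out of the generic class; the same computation should serve the A-slash field kernel with nothing changed but the kernel and the callbacks.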
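
And a minimal sketch of the extracted TimerArray on its own (timer names hypothetical). ModuleBase now inherits from it, so modules keep the exact timing interface they had before, while free-standing code can own one directly:

    #include <Hadrons/TimerArray.hpp>
    #include <iostream>

    using namespace Grid;
    using namespace Hadrons;

    void timedWork(void)
    {
        TimerArray tm;

        tm.startTimer("contraction");       // creates the stopwatch on first use
        // ... work ...
        tm.stopTimer("contraction");
        std::cout << tm.getDTimer("contraction") // double count, in us
                  << " us" << std::endl;         // (cf. the IO log above)

        tm.startCurrentTimer("io");         // starting a new "current" timer
        tm.startCurrentTimer("analysis");   // stops the previous one first
        tm.stopAllTimers();

        for (auto &t: tm.getTimings())      // std::map<std::string, GridTime>
        {
            std::cout << t.first << ": " << t.second.count() << " us" << std::endl;
        }
    }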