diff --git a/Hadrons/A2AMatrix.hpp b/Hadrons/A2AMatrix.hpp index 42d7b0ca..95133f30 100644 --- a/Hadrons/A2AMatrix.hpp +++ b/Hadrons/A2AMatrix.hpp @@ -41,7 +41,7 @@ See the full license in the file "LICENSE" in the top level distribution directo BEGIN_HADRONS_NAMESPACE -// general A2A matrix set class based on Eigen tensors and Grid-allocated memory +// general A2A matrix set based on Eigen tensors and Grid-allocated memory // Dimensions: // 0 - ext - external field (momentum, EM field, ...) // 1 - str - spin-color structure @@ -51,26 +51,9 @@ BEGIN_HADRONS_NAMESPACE template using A2AMatrixSet = Eigen::TensorMap>; -template -class A2AMatrixIo -{ -public: - A2AMatrixIo(void) = default; - A2AMatrixIo(std::string filename, std::string dataname, - const unsigned int nt, const unsigned int ni, - const unsigned int nj); - ~A2AMatrixIo(void) = default; - template - void initFile(const MetadataType &d, const unsigned int chunkSize); - void saveBlock(const T *data, const unsigned int i, const unsigned int j, - const unsigned int blockSizei, const unsigned int blockSizej); - void saveBlock(const A2AMatrixSet &m, const unsigned int ext, const unsigned int str, - const unsigned int i, const unsigned int j); -private: - std::string filename_, dataname_; - unsigned int nt_, ni_, nj_; -}; - +/****************************************************************************** + * Abstract class for A2A kernels * + ******************************************************************************/ template class A2AKernel { @@ -83,6 +66,36 @@ public: virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0; }; +/****************************************************************************** + * Class to handle A2A matrix block HDF5 I/O * + ******************************************************************************/ +template +class A2AMatrixIo +{ +public: + // constructors + A2AMatrixIo(void) = default; + A2AMatrixIo(std::string filename, std::string dataname, + const unsigned int nt, const unsigned int ni, + const unsigned int nj); + // destructor + ~A2AMatrixIo(void) = default; + // file allocation + template + void initFile(const MetadataType &d, const unsigned int chunkSize); + // block I/O + void saveBlock(const T *data, const unsigned int i, const unsigned int j, + const unsigned int blockSizei, const unsigned int blockSizej); + void saveBlock(const A2AMatrixSet &m, const unsigned int ext, const unsigned int str, + const unsigned int i, const unsigned int j); +private: + std::string filename_, dataname_; + unsigned int nt_, ni_, nj_; +}; + +/****************************************************************************** + * Wrapper for A2A matrix block computation * + ******************************************************************************/ template class A2AMatrixBlockComputation { @@ -96,6 +109,7 @@ private: typedef std::function FilenameFn; typedef std::function MetadataFn; public: + // constructor A2AMatrixBlockComputation(GridBase *grid, const unsigned int orthogDim, const unsigned int next, @@ -103,6 +117,7 @@ public: const unsigned int blockSize, const unsigned int cacheBlockSize, TimerArray *tArray = nullptr); + // execution void execute(const std::vector &left, const std::vector &right, A2AKernel &kernel, @@ -110,6 +125,7 @@ public: const FilenameFn &filenameFn, const MetadataFn &metadataFn); private: + // I/O handler void saveBlock(const A2AMatrixSet &m, IoHelper &h); private: TimerArray *tArray_; @@ -120,6 +136,100 @@ private: std::vector nodeIo_; }; +/****************************************************************************** + * A2AMatrixIo template implementation * + ******************************************************************************/ +// constructor ///////////////////////////////////////////////////////////////// +template +A2AMatrixIo::A2AMatrixIo(std::string filename, std::string dataname, + const unsigned int nt, const unsigned int ni, + const unsigned int nj) +: filename_(filename), dataname_(dataname) +, nt_(nt), ni_(ni), nj_(nj) +{} + +// file allocation ///////////////////////////////////////////////////////////// +template +template +void A2AMatrixIo::initFile(const MetadataType &d, const unsigned int chunkSize) +{ +#ifdef HAVE_HDF5 + std::vector dim = {static_cast(nt_), + static_cast(ni_), + static_cast(nj_)}, + chunk = {static_cast(nt_), + static_cast(chunkSize), + static_cast(chunkSize)}; + H5NS::DataSpace dataspace(dim.size(), dim.data()); + H5NS::DataSet dataset; + H5NS::DSetCreatPropList plist; + + // create empty file just with metadata + { + Hdf5Writer writer(filename_); + write(writer, dataname_, d); + } + + // create the dataset + Hdf5Reader reader(filename_); + + push(reader, dataname_); + auto &group = reader.getGroup(); + plist.setChunk(chunk.size(), chunk.data()); + dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type::type(), dataspace, plist); +#else + HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library"); +#endif +} + +// block I/O /////////////////////////////////////////////////////////////////// +template +void A2AMatrixIo::saveBlock(const T *data, + const unsigned int i, + const unsigned int j, + const unsigned int blockSizei, + const unsigned int blockSizej) +{ +#ifdef HAVE_HDF5 + Hdf5Reader reader(filename_); + std::vector count = {nt_, blockSizei, blockSizej}, + offset = {0, static_cast(i), + static_cast(j)}, + stride = {1, 1, 1}, + block = {1, 1, 1}; + H5NS::DataSpace memspace(count.size(), count.data()), dataspace; + H5NS::DataSet dataset; + size_t shift; + + push(reader, dataname_); + auto &group = reader.getGroup(); + dataset = group.openDataSet(HADRONS_A2AM_NAME); + dataspace = dataset.getSpace(); + dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(), + stride.data(), block.data()); + dataset.write(data, Hdf5Type::type(), memspace, dataspace); +#else + HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library"); +#endif +} + +template +void A2AMatrixIo::saveBlock(const A2AMatrixSet &m, + const unsigned int ext, const unsigned int str, + const unsigned int i, const unsigned int j) +{ + unsigned int blockSizei = m.dimension(3); + unsigned int blockSizej = m.dimension(4); + unsigned int nstr = m.dimension(1); + size_t offset = (ext*nstr + str)*nt_*blockSizei*blockSizej; + + saveBlock(m.data() + offset, i, j, blockSizei, blockSizej); +} + +/****************************************************************************** + * A2AMatrixBlockComputation template implementation * + ******************************************************************************/ +// constructor ///////////////////////////////////////////////////////////////// template A2AMatrixBlockComputation ::A2AMatrixBlockComputation(GridBase *grid, @@ -133,23 +243,15 @@ A2AMatrixBlockComputation , next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize) , tArray_(tArray) { - std::cout << nt_ << std::endl; - std::cout << next_ << std::endl; - std::cout << nstr_ << std::endl; - std::cout << cacheBlockSize_ << std::endl; - std::cout << blockSize_ << std::endl; mCache_.resize(nt_*next_*nstr_*cacheBlockSize_*cacheBlockSize_); mBuf_.resize(nt_*next_*nstr_*blockSize_*blockSize_); - std::cout << mCache_.size() << std::endl; - std::cout << mBuf_.size() << std::endl; - std::cout << blockSize << std::endl; - std::cout << cacheBlockSize << std::endl; } #define START_TIMER(name) if (tArray_) tArray_->startTimer(name) #define STOP_TIMER(name) if (tArray_) tArray_->stopTimer(name) #define GET_TIMER(name) ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.) +// execution /////////////////////////////////////////////////////////////////// template void A2AMatrixBlockComputation ::execute(const std::vector &left, const std::vector &right, @@ -157,20 +259,16 @@ void A2AMatrixBlockComputation const FilenameFn &filenameFn, const MetadataFn &metadataFn) { ////////////////////////////////////////////////////////////////////////// - // i,j is first loop over SchurBlock factors reusing 5D matrices - // ii,jj is second loop over cacheBlock factors for high perf contractoin + // i,j is first loop over blockSize_ factors + // ii,jj is second loop over cacheBlockSize_ factors for high perf contractions // iii,jjj are loops within cacheBlock // Total index is sum of these i+ii+iii etc... ////////////////////////////////////////////////////////////////////////// - int N_i = left.size(); - int N_j = right.size(); - double flops; - double bytes; - double t_kernel = 0.; - double nodes = grid_->NodeCount(); - double tot_kernel; + int N_i = left.size(); + int N_j = right.size(); + double flops, bytes, t_kernel; + double nodes = grid_->NodeCount(); - double t0 = usecond(); int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0); int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0); @@ -180,15 +278,13 @@ void A2AMatrixBlockComputation // Get the W and V vectors for this block^2 set of terms int N_ii = MIN(N_i-i,blockSize_); int N_jj = MIN(N_j-j,blockSize_); + A2AMatrixSet mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj); LOG(Message) << "All-to-all matrix block " << j/blockSize_ + NBlock_j*i/blockSize_ + 1 << "/" << NBlock_i*NBlock_j << " [" << i <<" .. " << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]" << std::endl; - - A2AMatrixSet mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj); - // Series of cache blocked chunks of the contractions within this block flops = 0.0; bytes = 0.0; @@ -201,18 +297,12 @@ void A2AMatrixBlockComputation int N_jjj = MIN(N_jj-jj,cacheBlockSize_); A2AMatrixSet mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj); - // makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, - // env().getNd() - 1, this); START_TIMER("kernel"); kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t); STOP_TIMER("kernel"); t_kernel += t; - // flops for general N_c & N_s - // flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; - // bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj - // + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma; - flops += kernel.flops(N_iii, N_jjj); - bytes += kernel.bytes(N_iii, N_jjj); + flops += kernel.flops(N_iii, N_jjj); + bytes += kernel.bytes(N_iii, N_jjj); START_TIMER("cache copy"); parallel_for_nest5(int e =0;e } // perf - // tot_kernel = getDTimer("contraction: colour trace & mom.") - // + getDTimer("contraction: local space sum"); LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes << " Gflop/s/node " << std::endl; LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes @@ -236,8 +324,7 @@ void A2AMatrixBlockComputation // IO double blockSize, ioTime; - unsigned int myRank = grid_->ThisRank(), - nRank = grid_->RankCount(); + unsigned int myRank = grid_->ThisRank(), nRank = grid_->RankCount(); LOG(Message) << "Writing block to disk" << std::endl; ioTime = -GET_TIMER("IO: write block"); @@ -293,6 +380,7 @@ void A2AMatrixBlockComputation } } +// I/O handler ///////////////////////////////////////////////////////////////// template void A2AMatrixBlockComputation ::saveBlock(const A2AMatrixSet &m, IoHelper &h) @@ -308,91 +396,9 @@ void A2AMatrixBlockComputation STOP_TIMER("IO: write block"); } -template -A2AMatrixIo::A2AMatrixIo(std::string filename, std::string dataname, - const unsigned int nt, const unsigned int ni, - const unsigned int nj) -: filename_(filename), dataname_(dataname) -, nt_(nt), ni_(ni), nj_(nj) -{} - -template -template -void A2AMatrixIo::initFile(const MetadataType &d, const unsigned int chunkSize) -{ -#ifdef HAVE_HDF5 - std::vector dim = {static_cast(nt_), - static_cast(ni_), - static_cast(nj_)}, - chunk = {static_cast(nt_), - static_cast(chunkSize), - static_cast(chunkSize)}; - H5NS::DataSpace dataspace(dim.size(), dim.data()); - H5NS::DataSet dataset; - H5NS::DSetCreatPropList plist; - - // create empty file just with metadata - { - Hdf5Writer writer(filename_); - write(writer, dataname_, d); - } - - // create the dataset - Hdf5Reader reader(filename_); - - push(reader, dataname_); - auto &group = reader.getGroup(); - plist.setChunk(chunk.size(), chunk.data()); - dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type::type(), dataspace, plist); -#else - HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library"); -#endif -} - -template -void A2AMatrixIo::saveBlock(const T *data, - const unsigned int i, - const unsigned int j, - const unsigned int blockSizei, - const unsigned int blockSizej) -{ -#ifdef HAVE_HDF5 - Hdf5Reader reader(filename_); - std::vector count = {nt_, blockSizei, blockSizej}, - offset = {0, static_cast(i), - static_cast(j)}, - stride = {1, 1, 1}, - block = {1, 1, 1}; - H5NS::DataSpace memspace(count.size(), count.data()), dataspace; - H5NS::DataSet dataset; - size_t shift; - - push(reader, dataname_); - auto &group = reader.getGroup(); - dataset = group.openDataSet(HADRONS_A2AM_NAME); - dataspace = dataset.getSpace(); - dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(), - stride.data(), block.data()); - dataset.write(data, Hdf5Type::type(), memspace, dataspace); -#else - HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library"); -#endif -} - -template -void A2AMatrixIo::saveBlock(const A2AMatrixSet &m, - const unsigned int ext, const unsigned int str, - const unsigned int i, const unsigned int j) -{ - unsigned int blockSizei = m.dimension(3); - unsigned int blockSizej = m.dimension(4); - unsigned int nstr = m.dimension(1); - size_t offset = (ext*nstr + str)*nt_*blockSizei*blockSizej; - - saveBlock(m.data() + offset, i, j, blockSizei, blockSizej); -} - - +#undef START_TIMER +#undef STOP_TIMER +#undef GET_TIMER END_HADRONS_NAMESPACE diff --git a/Hadrons/Modules/MContraction/A2AMesonField.hpp b/Hadrons/Modules/MContraction/A2AMesonField.hpp index 4d5f7ddd..f2c19507 100644 --- a/Hadrons/Modules/MContraction/A2AMesonField.hpp +++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp @@ -56,8 +56,8 @@ public: GRID_SERIALIZABLE_CLASS_MEMBERS(A2AMesonFieldPar, int, cacheBlock, int, block, - std::string, v, - std::string, w, + std::string, left, + std::string, right, std::string, output, std::string, gammas, std::vector, mom); @@ -140,11 +140,6 @@ public: virtual void setup(void); // execution virtual void execute(void); -private: - // IO - std::string ioname(const unsigned int m, const unsigned int g) const; - std::string filename(const unsigned int m, const unsigned int g) const; - void saveBlock(const A2AMatrixSet &mf, IoHelper &h); private: bool hasPhase_{false}; std::string momphName_; @@ -154,7 +149,6 @@ private: }; MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField), MContraction); -MODULE_REGISTER(ZA2AMesonField, ARG(TA2AMesonField), MContraction); /****************************************************************************** * TA2AMesonField implementation * @@ -171,7 +165,7 @@ TA2AMesonField::TA2AMesonField(const std::string name) template std::vector TA2AMesonField::getInput(void) { - std::vector in = {par().v, par().w}; + std::vector in = {par().left, par().right}; return in; } @@ -227,7 +221,6 @@ void TA2AMesonField::setup(void) } mom_.push_back(p); } - envCache(std::vector, momphName_, 1, par().mom.size(), envGetGrid(ComplexField)); envTmpLat(ComplexField, "coor"); @@ -240,19 +233,19 @@ void TA2AMesonField::setup(void) template void TA2AMesonField::execute(void) { - auto &v = envGet(std::vector, par().v); - auto &w = envGet(std::vector, par().w); + auto &left = envGet(std::vector, par().left); + auto &right = envGet(std::vector, par().right); int nt = env().getDim().back(); - int N_i = w.size(); - int N_j = v.size(); + int N_i = left.size(); + int N_j = right.size(); int ngamma = gamma_.size(); int nmom = mom_.size(); int block = par().block; int cacheBlock = par().cacheBlock; LOG(Message) << "Computing all-to-all meson fields" << std::endl; - LOG(Message) << "W: '" << par().w << "' V: '" << par().v << "'" << std::endl; + LOG(Message) << "Left: '" << par().left << "' Right: '" << par().right << "'" << std::endl; LOG(Message) << "Momenta:" << std::endl; for (auto &p: mom_) { @@ -309,7 +302,7 @@ void TA2AMesonField::execute(void) auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g) { return par().output + "." + std::to_string(vm().getTrajectory()) - + "/" + ioname(m, g) + ".h5"; + + "/" + ionameFn(m, g) + ".h5"; }; auto metadataFn = [this](const unsigned int m, const unsigned int g) @@ -328,189 +321,7 @@ void TA2AMesonField::execute(void) Kernel kernel(gamma_, ph, envGetGrid(FermionField)); envGetTmp(Computation, computation); - computation.execute(w, v, kernel, ionameFn, filenameFn, metadataFn); - ////////////////////////////////////////////////////////////////////////// - // i,j is first loop over SchurBlock factors reusing 5D matrices - // ii,jj is second loop over cacheBlock factors for high perf contractoin - // iii,jjj are loops within cacheBlock - // Total index is sum of these i+ii+iii etc... - ////////////////////////////////////////////////////////////////////////// - -// double flops; -// double bytes; -// double vol = env().getVolume(); -// double t_kernel = 0.0; -// double nodes = env().getGrid()->NodeCount(); -// double tot_kernel; - -// envGetTmp(Vector, mfBuf); -// envGetTmp(Vector, mfCache); - -// double t0 = usecond(); -// int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0); -// int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0); - -// for(int i=0;i mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj); - -// // Series of cache blocked chunks of the contractions within this block -// flops = 0.0; -// bytes = 0.0; -// for(int ii=0;ii mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj); - -// startTimer("contraction: total"); -// makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph, -// env().getNd() - 1, this); -// stopTimer("contraction: total"); - -// // flops for general N_c & N_s -// flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma; -// bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj -// + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma; - -// startTimer("cache copy"); -// parallel_for_nest5(int m =0;m< nmom;m++) -// for(int g =0;g< ngamma;g++) -// for(int t =0;t< nt;t++) -// for(int iii=0;iii< N_iii;iii++) -// for(int jjj=0;jjj< N_jjj;jjj++) -// { -// mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj); -// } -// stopTimer("cache copy"); -// } - -// // perf -// tot_kernel = getDTimer("contraction: colour trace & mom.") -// + getDTimer("contraction: local space sum"); -// t_kernel = tot_kernel - t_kernel; -// LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes -// << " Gflop/s/node " << std::endl; -// LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes -// << " GB/s/node " << std::endl; -// t_kernel = tot_kernel; - -// // IO -// if (!par().output.empty()) -// { -// double blockSize, ioTime; -// unsigned int myRank = env().getGrid()->ThisRank(), -// nRank = env().getGrid()->RankCount(); - -// LOG(Message) << "Writing block to disk" << std::endl; -// ioTime = -getDTimer("IO: write block"); -// startTimer("IO: total"); -// makeFileDir(filename(0, 0), env().getGrid()); -// #ifdef MF_PARALLEL_IO -// env().getGrid()->Barrier(); -// // make task list for current node -// nodeIo_.clear(); -// for(int f = myRank; f < nmom*ngamma; f += nRank) -// { -// IoHelper h; - -// h.i = i; -// h.j = j; -// h.m = f/ngamma; -// h.g = f % ngamma; -// h.io = A2AMatrixIo(filename(h.m, h.g), -// ioname(h.m, h.g), nt, N_i, N_j); -// for (auto pmu: mom_[h.m]) -// { -// h.md.momentum.push_back(pmu); -// } -// h.md.gamma = gamma_[h.g]; -// nodeIo_.push_back(h); -// } -// // parallel IO -// for (auto &h: nodeIo_) -// { -// saveBlock(mfBlock, h); -// } -// env().getGrid()->Barrier(); -// #else -// // serial IO, for testing purposes only -// for(int m = 0; m < nmom; m++) -// for(int g = 0; g < ngamma; g++) -// { -// IoHelper h; - -// h.i = i; -// h.j = j; -// h.m = m; -// h.g = g; -// h.io = A2AMatrixIo(filename(h.m, h.g), -// ioname(h.m, h.g), nt, N_i, N_j); -// for (auto pmu: mom_[h.m]) -// { -// h.md.momentum.push_back(pmu); -// } -// h.md.gamma = gamma_[h.g]; -// saveBlock(mfBlock, h); -// } -// #endif -// stopTimer("IO: total"); -// blockSize = static_cast(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE)); -// ioTime += getDTimer("IO: write block"); -// LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in " -// << ioTime << " us (" -// << blockSize/ioTime*1.0e6/1024/1024 -// << " MB/s)" << std::endl; -// } -// } -} - -// IO -template -std::string TA2AMesonField::ioname(unsigned int m, unsigned int g) const -{ - std::stringstream ss; - - ss << gamma_[g] << "_"; - for (unsigned int mu = 0; mu < mom_[m].size(); ++mu) - { - ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_"); - } - - return ss.str(); -} - -template -std::string TA2AMesonField::filename(unsigned int m, unsigned int g) const -{ - return par().output + "." + std::to_string(vm().getTrajectory()) - + "/" + ioname(m, g) + ".h5"; -} - -template -void TA2AMesonField::saveBlock(const A2AMatrixSet &mf, IoHelper &h) -{ - if ((h.i == 0) and (h.j == 0)) - { - startTimer("IO: file creation"); - h.io.initFile(h.md, par().block); - stopTimer("IO: file creation"); - } - startTimer("IO: write block"); - h.io.saveBlock(mf, h.m, h.g, h.i, h.j); - stopTimer("IO: write block"); + computation.execute(left, right, kernel, ionameFn, filenameFn, metadataFn); } END_MODULE_NAMESPACE