mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Hadrons: big update abstracting the block meson field routine, tested & working, performance counters broken and code dirty

Antonin Portelli 2018-10-04 20:01:49 +01:00
parent d0b21bf1ff
commit 58567fc650
10 changed files with 844 additions and 283 deletions

@@ -7,6 +7,7 @@ Source file: Hadrons/A2AMatrix.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -29,10 +30,28 @@ See the full license in the file "LICENSE" in the top level distribution directo
#define A2A_Matrix_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/TimerArray.hpp>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
#ifndef HADRONS_A2AM_NAME
#define HADRONS_A2AM_NAME "a2aMatrix"
#endif
#define HADRONS_A2AM_PARALLEL_IO
BEGIN_HADRONS_NAMESPACE
// general A2A matrix set class based on Eigen tensors and Grid-allocated memory
// Dimensions:
// 0 - ext - external field (momentum, EM field, ...)
// 1 - str - spin-color structure
// 2 - t - timeslice
// 3 - i - left A2A mode index
// 4 - j - right A2A mode index
template <typename T>
using A2AMatrixSet = Eigen::TensorMap<Eigen::Tensor<T, 5, Eigen::RowMajor>>;
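As an aside (editorial illustration, not part of this commit): an A2AMatrixSet is only a view, so the caller owns a flat, Grid-allocated buffer and maps it with the five dimensions listed above, which is exactly how mBuf_ and mCache_ are used further down. A minimal sketch, with example sizes chosen arbitrarily:

// illustrative sketch: map a Grid-allocated buffer as a 5d tensor view
const unsigned int next = 4, nstr = 16, nt = 32, ni = 16, nj = 16; // example sizes
Vector<ComplexD> buf(next*nstr*nt*ni*nj);                          // caller-owned storage
A2AMatrixSet<ComplexD> m(buf.data(), next, nstr, nt, ni, nj);      // non-owning view
m(0, 0, 0, 0, 0) = 1.;                                             // indices are (ext, str, t, i, j)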
template <typename T>
class A2AMatrixIo
{
public:
@@ -41,26 +60,265 @@
const unsigned int nt, const unsigned int ni,
const unsigned int nj);
~A2AMatrixIo(void) = default;
template <typename MetadataType>
void initFile(const MetadataType &d, const unsigned int chunkSize);
void saveBlock(const T *data, const unsigned int i, const unsigned int j,
const unsigned int blockSizei, const unsigned int blockSizej);
void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
const unsigned int i, const unsigned int j);
private:
std::string filename_, dataname_;
unsigned int nt_, ni_, nj_;
};
template <typename T, typename Field>
class A2AKernel
{
public:
A2AKernel(void) = default;
virtual ~A2AKernel(void) = default;
virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
const unsigned int orthogDim, double &time) = 0;
virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej) = 0;
};
template <typename T, typename Field, typename MetadataType, typename TIo = T>
class A2AMatrixBlockComputation
{
private:
struct IoHelper
{
A2AMatrixIo<TIo> io;
MetadataType md;
unsigned int e, s, i, j;
};
typedef std::function<std::string(const unsigned int, const unsigned int)> FilenameFn;
typedef std::function<MetadataType(const unsigned int, const unsigned int)> MetadataFn;
public:
A2AMatrixBlockComputation(GridBase *grid,
const unsigned int orthogDim,
const unsigned int next,
const unsigned int nstr,
const unsigned int blockSize,
const unsigned int cacheBlockSize,
TimerArray *tArray = nullptr);
void execute(const std::vector<Field> &left,
const std::vector<Field> &right,
A2AKernel<T, Field> &kernel,
const FilenameFn &ionameFn,
const FilenameFn &filenameFn,
const MetadataFn &metadataFn);
private:
void saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h);
private:
TimerArray *tArray_;
GridBase *grid_;
unsigned int orthogDim_, nt_, next_, nstr_, blockSize_, cacheBlockSize_;
Vector<T> mCache_;
Vector<TIo> mBuf_;
std::vector<IoHelper> nodeIo_;
};
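A rough usage sketch of the new class (illustrative only; myKernel, MyMetadata, grid, Tp, nExt, nStr, w, v and timers are caller-side placeholders, not names from this commit): the caller supplies the two sets of A2A fields, a concrete A2AKernel, and callbacks that name each (ext, str) dataset and produce its metadata, and execute() takes care of the cache blocking, the kernel calls and the parallel HDF5 output.

// hypothetical caller; myKernel derives from A2AKernel<ComplexD, LatticeFermion>
// and MyMetadata is any serialisable metadata struct
A2AMatrixBlockComputation<ComplexD, LatticeFermion, MyMetadata>
    computation(grid, Tp, nExt, nStr, 512, 128, &timers);  // orthogonal dim Tp, block 512, cache block 128
computation.execute(w, v, myKernel,
                    [](const unsigned int e, const unsigned int s)
                    { return "a2a_" + std::to_string(e) + "_" + std::to_string(s); },              // dataset name
                    [](const unsigned int e, const unsigned int s)
                    { return "out/a2a_" + std::to_string(e) + "_" + std::to_string(s) + ".h5"; },  // file name
                    [](const unsigned int e, const unsigned int s)
                    { return MyMetadata{}; });                                                     // metadata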
template <typename T, typename Field, typename MetadataType, typename TIo>
A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
::A2AMatrixBlockComputation(GridBase *grid,
const unsigned int orthogDim,
const unsigned int next,
const unsigned int nstr,
const unsigned int blockSize,
const unsigned int cacheBlockSize,
TimerArray *tArray)
: grid_(grid), nt_(grid->GlobalDimensions()[orthogDim]), orthogDim_(orthogDim)
, next_(next), nstr_(nstr), blockSize_(blockSize), cacheBlockSize_(cacheBlockSize)
, tArray_(tArray)
{
std::cout << nt_ << std::endl;
std::cout << next_ << std::endl;
std::cout << nstr_ << std::endl;
std::cout << cacheBlockSize_ << std::endl;
std::cout << blockSize_ << std::endl;
mCache_.resize(nt_*next_*nstr_*cacheBlockSize_*cacheBlockSize_);
mBuf_.resize(nt_*next_*nstr_*blockSize_*blockSize_);
std::cout << mCache_.size() << std::endl;
std::cout << mBuf_.size() << std::endl;
std::cout << blockSize << std::endl;
std::cout << cacheBlockSize << std::endl;
}
#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
#define STOP_TIMER(name) if (tArray_) tArray_->stopTimer(name)
#define GET_TIMER(name) (tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.
template <typename T, typename Field, typename MetadataType, typename TIo>
void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
::execute(const std::vector<Field> &left, const std::vector<Field> &right,
A2AKernel<T, Field> &kernel, const FilenameFn &ionameFn,
const FilenameFn &filenameFn, const MetadataFn &metadataFn)
{
//////////////////////////////////////////////////////////////////////////
// i,j is first loop over SchurBlock factors reusing 5D matrices
// ii,jj is second loop over cacheBlock factors for high perf contraction
// iii,jjj are loops within cacheBlock
// Total index is sum of these i+ii+iii etc...
//////////////////////////////////////////////////////////////////////////
int N_i = left.size();
int N_j = right.size();
double flops;
double bytes;
double t_kernel = 0.;
double nodes = grid_->NodeCount();
double tot_kernel;
double t0 = usecond();
int NBlock_i = N_i/blockSize_ + (((N_i % blockSize_) != 0) ? 1 : 0);
int NBlock_j = N_j/blockSize_ + (((N_j % blockSize_) != 0) ? 1 : 0);
for(int i=0;i<N_i;i+=blockSize_)
for(int j=0;j<N_j;j+=blockSize_)
{
// Get the W and V vectors for this block^2 set of terms
int N_ii = MIN(N_i-i,blockSize_);
int N_jj = MIN(N_j-j,blockSize_);
LOG(Message) << "All-to-all matrix block "
<< j/blockSize_ + NBlock_j*i/blockSize_ + 1
<< "/" << NBlock_i*NBlock_j << " [" << i <<" .. "
<< i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]"
<< std::endl;
A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
// Series of cache blocked chunks of the contractions within this block
flops = 0.0;
bytes = 0.0;
for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
{
int N_iii = MIN(N_ii-ii,cacheBlockSize_);
int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
// makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph,
// env().getNd() - 1, this);
START_TIMER("kernel");
kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, tot_kernel);
STOP_TIMER("kernel");
// flops for general N_c & N_s
// flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
// bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
// + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
flops += kernel.flops(N_iii, N_jjj);
bytes += kernel.bytes(N_iii, N_jjj);
START_TIMER("cache copy");
parallel_for_nest5(int e =0;e<next_;e++)
for(int s =0;s< nstr_;s++)
for(int t =0;t< nt_;t++)
for(int iii=0;iii< N_iii;iii++)
for(int jjj=0;jjj< N_jjj;jjj++)
{
mBlock(e,s,t,ii+iii,jj+jjj) = mCacheBlock(e,s,t,iii,jjj);
}
STOP_TIMER("cache copy");
}
// perf
// tot_kernel = getDTimer("contraction: colour trace & mom.")
// + getDTimer("contraction: local space sum");
t_kernel = tot_kernel - t_kernel;
LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes
<< " Gflop/s/node " << std::endl;
LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes
<< " GB/s/node " << std::endl;
t_kernel = tot_kernel;
// IO
double blockSize, ioTime;
unsigned int myRank = grid_->ThisRank(),
nRank = grid_->RankCount();
LOG(Message) << "Writing block to disk" << std::endl;
ioTime = -GET_TIMER("IO: write block");
START_TIMER("IO: total");
makeFileDir(filenameFn(0, 0), grid_);
#ifdef HADRONS_A2AM_PARALLEL_IO
grid_->Barrier();
// make task list for current node
nodeIo_.clear();
for(int f = myRank; f < next_*nstr_; f += nRank)
{
IoHelper h;
h.i = i;
h.j = j;
h.e = f/nstr_;
h.s = f % nstr_;
h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s),
ionameFn(h.e, h.s), nt_, N_i, N_j);
h.md = metadataFn(h.e, h.s);
nodeIo_.push_back(h);
}
// parallel IO
for (auto &h: nodeIo_)
{
saveBlock(mBlock, h);
}
grid_->Barrier();
#else
// serial IO, for testing purposes only
for(int e = 0; e < next_; e++)
for(int s = 0; s < nstr_; s++)
{
IoHelper h;
h.i = i;
h.j = j;
h.e = e;
h.s = s;
h.io = A2AMatrixIo<TIo>(filenameFn(h.e, h.s),
ionameFn(h.e, h.s), nt_, N_i, N_j);
h.md = metadataFn(h.e, h.s);
saveBlock(mBlock, h);
}
#endif
STOP_TIMER("IO: total");
blockSize = static_cast<double>(next_*nstr_*nt_*N_ii*N_jj*sizeof(TIo));
ioTime += GET_TIMER("IO: write block");
LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
<< ioTime << " us ("
<< blockSize/ioTime*1.0e6/1024/1024
<< " MB/s)" << std::endl;
}
}
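For orientation, a worked size example (editorial; the numbers echo the estimate quoted in the meson field kernel comment later in this commit): with 7 momenta, 16 gamma structures, nt_ = 32 and blockSize_ = 16, a run over N_i = N_j = 64 modes is split by the i,j loop into NBlock_i = NBlock_j = 64/16 = 4 blocks (16 block pairs in total), and each block buffer holds

next_ * nstr_ * nt_ * blockSize_^2 = 7 * 16 * 32 * 16 * 16 = 917504 complex numbers,

roughly 14 MiB of mBuf_ in double precision (16 bytes per complex), in line with the 2-60 MB range mentioned in that comment.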
template <typename T, typename Field, typename MetadataType, typename TIo>
void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
::saveBlock(const A2AMatrixSet<TIo> &m, IoHelper &h)
{
if ((h.i == 0) and (h.j == 0))
{
START_TIMER("IO: file creation");
h.io.initFile(h.md, blockSize_);
STOP_TIMER("IO: file creation");
}
START_TIMER("IO: write block");
h.io.saveBlock(m, h.e, h.s, h.i, h.j);
STOP_TIMER("IO: write block");
}
template <typename T>
A2AMatrixIo<T>::A2AMatrixIo(std::string filename, std::string dataname,
const unsigned int nt, const unsigned int ni,
const unsigned int nj)
: filename_(filename), dataname_(dataname)
, nt_(nt), ni_(ni), nj_(nj)
{}
template <typename T>
template <typename MetadataType>
void A2AMatrixIo<T>::initFile(const MetadataType &d, const unsigned int chunkSize)
{
#ifdef HAVE_HDF5
std::vector<hsize_t> dim = {static_cast<hsize_t>(nt_),
@@ -85,18 +343,18 @@ void A2AMatrixIo<T, MetadataType>::initFile(const MetadataType &d, const unsigne
push(reader, dataname_);
auto &group = reader.getGroup();
plist.setChunk(chunk.size(), chunk.data());
dataset = group.createDataSet(HADRONS_A2AM_NAME, Hdf5Type<T>::type(), dataspace, plist);
#else
HADRONS_ERROR(Implementation, "all-to-all matrix I/O needs HDF5 library");
#endif
}
template <typename T>
void A2AMatrixIo<T>::saveBlock(const T *data,
const unsigned int i,
const unsigned int j,
const unsigned int blockSizei,
const unsigned int blockSizej)
{
#ifdef HAVE_HDF5
Hdf5Reader reader(filename_);
@@ -111,7 +369,7 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
push(reader, dataname_);
auto &group = reader.getGroup();
dataset = group.openDataSet(HADRONS_A2AM_NAME);
dataspace = dataset.getSpace();
dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
stride.data(), block.data());
@@ -121,6 +379,21 @@ void A2AMatrixIo<T, MetadataType>::saveBlock(const T *data,
#endif
}
template <typename T>
void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,
const unsigned int ext, const unsigned int str,
const unsigned int i, const unsigned int j)
{
unsigned int blockSizei = m.dimension(3);
unsigned int blockSizej = m.dimension(4);
unsigned int nstr = m.dimension(1);
size_t offset = (ext*nstr + str)*nt_*blockSizei*blockSizej;
saveBlock(m.data() + offset, i, j, blockSizei, blockSizej);
}
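A short note on the offset used above (editorial): since the tensor is row-major with dimensions (ext, str, t, i, j), the (ext, str) sub-block of size nt_ x blockSizei x blockSizej starts at

offset(ext, str) = (ext*nstr + str) * nt_ * blockSizei * blockSizej

so the pointer handed to the flat saveBlock() overload is the first element of that slab, and the hyperslab write then only needs the (i, j) position and the block sizes.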
END_HADRONS_NAMESPACE
#endif // A2A_Matrix_hpp_
@@ -11,6 +11,7 @@ libHadrons_a_SOURCES = \
Exceptions.cc \
Global.cc \
Module.cc \
TimerArray.cc \
VirtualMachine.cc
libHadrons_adir = $(includedir)/Hadrons
nobase_libHadrons_a_HEADERS = \
@@ -31,4 +32,5 @@ nobase_libHadrons_a_HEADERS = \
Modules.hpp \
ModuleFactory.hpp \
Solver.hpp \
TimerArray.hpp \
VirtualMachine.hpp
@@ -66,101 +66,6 @@ void ModuleBase::operator()(void)
stopAllTimers();
}
// timers //////////////////////////////////////////////////////////////////////
void ModuleBase::startTimer(const std::string &name)
{
if (!name.empty())
{
timer_[name].Start();
}
}
GridTime ModuleBase::getTimer(const std::string &name)
{
GridTime t;
if (!name.empty())
{
try
{
bool running = timer_.at(name).isRunning();
if (running) stopTimer(name);
t = timer_.at(name).Elapsed();
if (running) startTimer(name);
}
catch (std::out_of_range &)
{
t = GridTime::zero();
}
}
else
{
t = GridTime::zero();
}
return t;
}
double ModuleBase::getDTimer(const std::string &name)
{
return static_cast<double>(getTimer(name).count());
}
void ModuleBase::startCurrentTimer(const std::string &name)
{
if (!name.empty())
{
stopCurrentTimer();
startTimer(name);
currentTimer_ = name;
}
}
void ModuleBase::stopTimer(const std::string &name)
{
if (timer_.at(name).isRunning())
{
timer_.at(name).Stop();
}
}
void ModuleBase::stopCurrentTimer(void)
{
if (!currentTimer_.empty())
{
stopTimer(currentTimer_);
currentTimer_ = "";
}
}
void ModuleBase::stopAllTimers(void)
{
for (auto &t: timer_)
{
stopTimer(t.first);
}
currentTimer_ = "";
}
void ModuleBase::resetTimers(void)
{
timer_.clear();
currentTimer_ = "";
}
std::map<std::string, GridTime> ModuleBase::getTimings(void)
{
std::map<std::string, GridTime> timing;
for (auto &t: timer_)
{
timing[t.first] = t.second.Elapsed();
}
return timing;
}
std::string ModuleBase::makeSeedString(void)
{
std::string seed;
@@ -30,6 +30,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#define Hadrons_Module_hpp_
#include <Hadrons/Global.hpp>
#include <Hadrons/TimerArray.hpp>
#include <Hadrons/VirtualMachine.hpp>
BEGIN_HADRONS_NAMESPACE
@@ -152,7 +153,7 @@ if (env().getGrid()->IsBoss() and !ioStem.empty())\
* Module class *
******************************************************************************/
// base class
class ModuleBase: public TimerArray
{
public:
// constructor
@@ -180,16 +181,6 @@ public:
virtual void execute(void) = 0;
// execution
void operator()(void);
// timers
void startTimer(const std::string &name);
GridTime getTimer(const std::string &name);
double getDTimer(const std::string &name);
void startCurrentTimer(const std::string &name);
void stopTimer(const std::string &name);
void stopCurrentTimer(void);
void stopAllTimers(void);
void resetTimers(void);
std::map<std::string, GridTime> getTimings(void);
protected:
// environment shortcut
DEFINE_ENV_ALIAS;
@@ -1,6 +1,6 @@
#include <Hadrons/Modules/MContraction/Baryon.hpp>
#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
#include <Hadrons/Modules/MContraction/A2AKernels.hpp>
#include <Hadrons/Modules/MContraction/Meson.hpp>
#include <Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
#include <Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
@@ -31,7 +31,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Hadrons/Global.hpp>
#include <Hadrons/Module.hpp>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
BEGIN_HADRONS_NAMESPACE
@@ -45,10 +44,10 @@ template <typename Field, typename MesonField>
void makeMesonFieldBlock(MesonField &mat,
const Field *lhs_wi,
const Field *rhs_vj,
const std::vector<Gamma::Algebra> &gamma,
const std::vector<LatticeComplex> &mom,
int orthogdim,
double &time)
{
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_object sobj;
@@ -58,6 +57,8 @@ void makeMesonFieldBlock(MesonField &mat,
typedef iSpinMatrix<vector_type> SpinMatrix_v;
typedef iSpinMatrix<scalar_type> SpinMatrix_s;
TimerArray tArray;
int Lblock = mat.dimension(3);
int Rblock = mat.dimension(4);
@@ -96,7 +97,7 @@ void makeMesonFieldBlock(MesonField &mat,
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
tArray.startTimer("contraction: colour trace & mom.");
// Nested parallelism would be ok
// Wasting cores here. Test case r
parallel_for(int r=0;r<rd;r++)
@@ -138,10 +139,10 @@ void makeMesonFieldBlock(MesonField &mat,
}
}
}
tArray.stopTimer("contraction: colour trace & mom.");
// Sum across simd lanes in the plane, breaking out orthog dir.
tArray.startTimer("contraction: local space sum");
parallel_for(int rt=0;rt<rd;rt++)
{
std::vector<int> icoor(Nd);
@@ -166,10 +167,12 @@ void makeMesonFieldBlock(MesonField &mat,
}
}
}
tArray.stopTimer("contraction: local space sum");
time = tArray.getDTimer("contraction: colour trace & mom.")
+ tArray.getDTimer("contraction: local space sum");
// ld loop and local only??
tArray.startTimer("contraction: spin trace");
int pd = grid->_processors[orthogdim];
int pc = grid->_processor_coor[orthogdim];
parallel_for_nest2(int lt=0;lt<ld;lt++)
@@ -206,14 +209,198 @@ void makeMesonFieldBlock(MesonField &mat,
}
}
}
tArray.stopTimer("contraction: spin trace");
////////////////////////////////////////////////////////////////////
// This global sum is taking as much as 50% of time on 16 nodes
// Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
// Healthy size that should suffice
////////////////////////////////////////////////////////////////////
tArray.startTimer("contraction: global sum");
grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
tArray.stopTimer("contraction: global sum");
}
template <typename Field, typename AslashField>
void makeAslashFieldBlock(AslashField &mat,
const Field *lhs_wi,
const Field *rhs_vj,
const std::vector<LatticeComplex> &emB0,
const std::vector<LatticeComplex> &emB1,
int orthogdim,
ModuleBase *caller = nullptr)
{
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef iSpinMatrix<vector_type> SpinMatrix_v;
typedef iSpinMatrix<scalar_type> SpinMatrix_s;
int Lblock = mat.dimension(2);
int Rblock = mat.dimension(3);
GridBase *grid = lhs_wi[0]._grid;
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
int Nt = grid->GlobalDimensions()[orthogdim];
int Nem = emB0.size();
assert(emB1.size() == Nem);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// will locally sum vectors first
// sum across these down to scalars
// splitting the SIMD
int MFrvol = rd*Lblock*Rblock*Nem;
int MFlvol = ld*Lblock*Rblock*Nem;
Vector<vector_type> lvSum(MFrvol);
parallel_for (int r = 0; r < MFrvol; r++)
{
lvSum[r] = zero;
}
Vector<scalar_type> lsSum(MFlvol);
parallel_for (int r = 0; r < MFlvol; r++)
{
lsSum[r] = scalar_type(0.0);
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
if (caller) caller->startTimer("contraction: colour trace & Aslash mul");
// Nested parallelism would be ok
// Wasting cores here. Test case r
parallel_for(int r=0;r<rd;r++)
{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++)
for(int b=0;b<e2;b++)
{
int ss= so+n*stride+b;
for(int i=0;i<Lblock;i++)
{
auto left = conjugate(lhs_wi[i]._odata[ss]);
for(int j=0;j<Rblock;j++)
{
SpinMatrix_v vv;
auto right = rhs_vj[j]._odata[ss];
for(int s1=0;s1<Ns;s1++)
for(int s2=0;s2<Ns;s2++)
{
vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
+ left()(s2)(1) * right()(s1)(1)
+ left()(s2)(2) * right()(s1)(2);
}
// After getting the sitewise product do the mom phase loop
int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
for ( int m=0;m<Nem;m++)
{
int idx = m+base;
auto b0 = emB0[m]._odata[ss];
auto b1 = emB1[m]._odata[ss];
auto cb0 = conjugate(b0);
auto cb1 = conjugate(b1);
// B_0 = A_1 + i A_0
// B_1 = A_3 + i A_2
//
// then in spin space
//
// ( 0 0 B_1 -conj(B_0) )
// A_mu g_mu = ( 0 0 B_0 conj(B_1) )
// ( conj(B_1) conj(B_0) 0 0 )
// ( -B_0 B_1 0 0 )
lvSum[idx] += vv()(0,2)()*b1 - vv()(0,3)()*cb0
+ vv()(1,2)()*b0 + vv()(1,3)()*cb1
+ vv()(2,0)()*cb1 + vv()(2,1)()*cb0
- vv()(3,0)()*b0 + vv()(3,1)()*b1;
}
}
}
}
}
if (caller) caller->stopTimer("contraction: colour trace & Aslash mul");
// Sum across simd lanes in the plane, breaking out orthog dir.
if (caller) caller->startTimer("contraction: local space sum");
parallel_for(int rt=0;rt<rd;rt++)
{
std::vector<int> icoor(Nd);
std::vector<scalar_type> extracted(Nsimd);
for(int i=0;i<Lblock;i++)
for(int j=0;j<Rblock;j++)
for(int m=0;m<Nem;m++)
{
int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
extract(lvSum[ij_rdx],extracted);
for(int idx=0;idx<Nsimd;idx++)
{
grid->iCoorFromIindex(icoor,idx);
int ldx = rt+icoor[orthogdim]*rd;
int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
}
}
}
if (caller) caller->stopTimer("contraction: local space sum");
// ld loop and local only??
if (caller) caller->startTimer("contraction: tensor store");
int pd = grid->_processors[orthogdim];
int pc = grid->_processor_coor[orthogdim];
parallel_for_nest2(int lt=0;lt<ld;lt++)
{
for(int pt=0;pt<pd;pt++)
{
int t = lt + pt*ld;
if (pt == pc)
{
for(int i=0;i<Lblock;i++)
for(int j=0;j<Rblock;j++)
for(int m=0;m<Nem;m++)
{
int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
mat(m,t,i,j) = lsSum[ij_dx];
}
}
else
{
const scalar_type zz(0.0);
for(int i=0;i<Lblock;i++)
for(int j=0;j<Rblock;j++)
for(int m=0;m<Nem;m++)
{
mat(m,t,i,j) = zz;
}
}
}
}
if (caller) caller->stopTimer("contraction: tensor store");
if (caller) caller->startTimer("contraction: global sum");
grid->GlobalSumVector(&mat(0,0,0,0),Nem*Nt*Lblock*Rblock);
if (caller) caller->stopTimer("contraction: global sum");
}
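Put as a formula (an editorial reading of the loops above, using the decomposition from the spin-space comment): for each photon field pair (emB0[m], emB1[m]) with B_0 = A_1 + i A_0 and B_1 = A_3 + i A_2, the routine accumulates, before the final global sum,

mat(m, t, i, j) = sum_{x in timeslice t} sum_{s1, s2, c} (A_mu(x) gamma_mu)_{s1 s2} * conj(w_i(x))_{s2 c} * v_j(x)_{s1 c}

i.e. the colour trace of the sitewise outer product contracted with the A_mu gamma_mu spin matrix written out in the comment.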
@@ -36,7 +36,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Hadrons/A2AVectors.hpp>
#include <Hadrons/A2AMatrix.hpp>
#include <Hadrons/Modules/MSolver/A2AVectors.hpp>
#include <Hadrons/Modules/MContraction/A2AKernels.hpp>
#define MF_PARALLEL_IO
#ifndef MF_IO_TYPE
@@ -71,21 +71,62 @@ public:
Gamma::Algebra, gamma);
};
template <typename T, typename Field>
class MesonFieldKernel: public A2AKernel<T, Field>
{
public:
MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
const std::vector<LatticeComplex> &mom,
GridBase *grid)
: gamma_(gamma), mom_(mom), grid_(grid)
{
vol_ = 1.;
for (auto &d: grid_->GlobalDimensions())
{
vol_ *= d;
}
}
virtual ~MesonFieldKernel(void) = default;
virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
const unsigned int orthogDim, double &time)
{
makeMesonFieldBlock(m, left, right, gamma_, mom_, orthogDim, time);
}
virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
{
return vol_*(2*8.0+6.0+8.0*mom_.size())*blockSizei*blockSizej*gamma_.size();
}
virtual double bytes(const unsigned int blockSizei, const unsigned int blockSizej)
{
return vol_*(12.0*sizeof(T))*blockSizei*blockSizej
+ vol_*(2.0*sizeof(T)*mom_.size())*blockSizei*blockSizej*gamma_.size();
}
private:
const std::vector<Gamma::Algebra> &gamma_;
const std::vector<LatticeComplex> &mom_;
GridBase *grid_;
double vol_;
};
template <typename FImpl>
class TA2AMesonField : public Module<A2AMesonFieldPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
SOLVER_TYPE_ALIASES(FImpl,);
typedef A2AMatrixBlockComputation<Complex,
FermionField,
A2AMesonFieldMetadata,
MF_IO_TYPE> Computation;
typedef MesonFieldKernel<Complex, FermionField> Kernel;
struct IoHelper
{
A2AMatrixIo<MF_IO_TYPE> io;
A2AMesonFieldMetadata md;
unsigned int m, g, i, j;
};
public:
// constructor
@@ -103,13 +144,13 @@
// IO
std::string ioname(const unsigned int m, const unsigned int g) const;
std::string filename(const unsigned int m, const unsigned int g) const;
void saveBlock(const A2AMatrixSet<MF_IO_TYPE> &mf, IoHelper &h);
private:
bool hasPhase_{false};
std::string momphName_;
std::vector<Gamma::Algebra> gamma_;
std::vector<std::vector<Real>> mom_;
std::vector<IoHelper> nodeIo_;
};
MODULE_REGISTER(A2AMesonField, ARG(TA2AMesonField<FIMPL>), MContraction);
@@ -190,11 +231,9 @@ void TA2AMesonField<FImpl>::setup(void)
envCache(std::vector<ComplexField>, momphName_, 1,
par().mom.size(), envGetGrid(ComplexField));
envTmpLat(ComplexField, "coor");
envTmp(Computation, "computation", 1, envGetGrid(FermionField),
env().getNd() - 1, mom_.size(), gamma_.size(), par().block,
par().cacheBlock, this);
}
// execution ///////////////////////////////////////////////////////////////////
@@ -253,7 +292,43 @@ void TA2AMesonField<FImpl>::execute(void)
hasPhase_ = true;
stopTimer("Momentum phases");
}
auto ionameFn = [this](const unsigned int m, const unsigned int g)
{
std::stringstream ss;
ss << gamma_[g] << "_";
for (unsigned int mu = 0; mu < mom_[m].size(); ++mu)
{
ss << mom_[m][mu] << ((mu == mom_[m].size() - 1) ? "" : "_");
}
return ss.str();
};
auto filenameFn = [this, &ionameFn](const unsigned int m, const unsigned int g)
{
return par().output + "." + std::to_string(vm().getTrajectory())
+ "/" + ioname(m, g) + ".h5";
};
auto metadataFn = [this](const unsigned int m, const unsigned int g)
{
A2AMesonFieldMetadata md;
for (auto pmu: mom_[m])
{
md.momentum.push_back(pmu);
}
md.gamma = gamma_[g];
return md;
};
Kernel kernel(gamma_, ph, envGetGrid(FermionField));
envGetTmp(Computation, computation);
computation.execute(w, v, kernel, ionameFn, filenameFn, metadataFn);
//////////////////////////////////////////////////////////////////////////
// i,j is first loop over SchurBlock factors reusing 5D matrices
// ii,jj is second loop over cacheBlock factors for high perf contraction
@@ -261,145 +336,145 @@ void TA2AMesonField<FImpl>::execute(void)
// Total index is sum of these i+ii+iii etc...
//////////////////////////////////////////////////////////////////////////
// double flops;
// double bytes;
// double vol = env().getVolume();
// double t_kernel = 0.0;
// double nodes = env().getGrid()->NodeCount();
// double tot_kernel;
// envGetTmp(Vector<MF_IO_TYPE>, mfBuf);
// envGetTmp(Vector<Complex>, mfCache);
// double t0 = usecond();
// int NBlock_i = N_i/block + (((N_i % block) != 0) ? 1 : 0);
// int NBlock_j = N_j/block + (((N_j % block) != 0) ? 1 : 0);
// for(int i=0;i<N_i;i+=block)
// for(int j=0;j<N_j;j+=block)
// {
// // Get the W and V vectors for this block^2 set of terms
// int N_ii = MIN(N_i-i,block);
// int N_jj = MIN(N_j-j,block);
// LOG(Message) << "Meson field block "
// << j/block + NBlock_j*i/block + 1
// << "/" << NBlock_i*NBlock_j << " [" << i <<" .. "
// << i+N_ii-1 << ", " << j <<" .. " << j+N_jj-1 << "]"
// << std::endl;
// A2AMatrixSet<MF_IO_TYPE> mfBlock(mfBuf.data(),nmom,ngamma,nt,N_ii,N_jj);
// // Series of cache blocked chunks of the contractions within this block
// flops = 0.0;
// bytes = 0.0;
// for(int ii=0;ii<N_ii;ii+=cacheBlock)
// for(int jj=0;jj<N_jj;jj+=cacheBlock)
// {
// int N_iii = MIN(N_ii-ii,cacheBlock);
// int N_jjj = MIN(N_jj-jj,cacheBlock);
// A2AMatrixSet<Complex> mfCacheBlock(mfCache.data(),nmom,ngamma,nt,N_iii,N_jjj);
// startTimer("contraction: total");
// makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph,
// env().getNd() - 1, this);
// stopTimer("contraction: total");
// // flops for general N_c & N_s
// flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
// bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
// + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj* ngamma;
// startTimer("cache copy");
// parallel_for_nest5(int m =0;m< nmom;m++)
// for(int g =0;g< ngamma;g++)
// for(int t =0;t< nt;t++)
// for(int iii=0;iii< N_iii;iii++)
// for(int jjj=0;jjj< N_jjj;jjj++)
// {
// mfBlock(m,g,t,ii+iii,jj+jjj) = mfCacheBlock(m,g,t,iii,jjj);
// }
// stopTimer("cache copy");
// }
// // perf
// tot_kernel = getDTimer("contraction: colour trace & mom.")
// + getDTimer("contraction: local space sum");
// t_kernel = tot_kernel - t_kernel;
// LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes
// << " Gflop/s/node " << std::endl;
// LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes
// << " GB/s/node " << std::endl;
// t_kernel = tot_kernel;
// // IO
// if (!par().output.empty())
// {
// double blockSize, ioTime;
// unsigned int myRank = env().getGrid()->ThisRank(),
// nRank = env().getGrid()->RankCount();
// LOG(Message) << "Writing block to disk" << std::endl;
// ioTime = -getDTimer("IO: write block");
// startTimer("IO: total");
// makeFileDir(filename(0, 0), env().getGrid());
// #ifdef MF_PARALLEL_IO
// env().getGrid()->Barrier();
// // make task list for current node
// nodeIo_.clear();
// for(int f = myRank; f < nmom*ngamma; f += nRank)
// {
// IoHelper h;
// h.i = i;
// h.j = j;
// h.m = f/ngamma;
// h.g = f % ngamma;
// h.io = A2AMatrixIo<MF_IO_TYPE>(filename(h.m, h.g),
// ioname(h.m, h.g), nt, N_i, N_j);
// for (auto pmu: mom_[h.m])
// {
// h.md.momentum.push_back(pmu);
// }
// h.md.gamma = gamma_[h.g];
// nodeIo_.push_back(h);
// }
// // parallel IO
// for (auto &h: nodeIo_)
// {
// saveBlock(mfBlock, h);
// }
// env().getGrid()->Barrier();
// #else
// // serial IO, for testing purposes only
// for(int m = 0; m < nmom; m++)
// for(int g = 0; g < ngamma; g++)
// {
// IoHelper h;
// h.i = i;
// h.j = j;
// h.m = m;
// h.g = g;
// h.io = A2AMatrixIo<MF_IO_TYPE>(filename(h.m, h.g),
// ioname(h.m, h.g), nt, N_i, N_j);
// for (auto pmu: mom_[h.m])
// {
// h.md.momentum.push_back(pmu);
// }
// h.md.gamma = gamma_[h.g];
// saveBlock(mfBlock, h);
// }
// #endif
// stopTimer("IO: total");
// blockSize = static_cast<double>(nmom*ngamma*nt*N_ii*N_jj*sizeof(MF_IO_TYPE));
// ioTime += getDTimer("IO: write block");
// LOG(Message) << "HDF5 IO done " << sizeString(blockSize) << " in "
// << ioTime << " us ("
// << blockSize/ioTime*1.0e6/1024/1024
// << " MB/s)" << std::endl;
// }
// }
}
// IO
@@ -425,16 +500,16 @@ std::string TA2AMesonField<FImpl>::filename(unsigned int m, unsigned int g) cons
}
template <typename FImpl>
void TA2AMesonField<FImpl>::saveBlock(const A2AMatrixSet<MF_IO_TYPE> &mf, IoHelper &h)
{
if ((h.i == 0) and (h.j == 0))
{
startTimer("IO: file creation");
h.io.initFile(h.md, par().block);
stopTimer("IO: file creation");
}
startTimer("IO: write block");
h.io.saveBlock(mf, h.m, h.g, h.i, h.j);
stopTimer("IO: write block");
}
Hadrons/TimerArray.cc (new file, 99 additions)
@@ -0,0 +1,99 @@
#include <Hadrons/TimerArray.hpp>
using namespace Grid;
using namespace QCD;
using namespace Hadrons;
void TimerArray::startTimer(const std::string &name)
{
if (!name.empty())
{
timer_[name].Start();
}
}
GridTime TimerArray::getTimer(const std::string &name)
{
GridTime t;
if (!name.empty())
{
try
{
bool running = timer_.at(name).isRunning();
if (running) stopTimer(name);
t = timer_.at(name).Elapsed();
if (running) startTimer(name);
}
catch (std::out_of_range &)
{
t = GridTime::zero();
}
}
else
{
t = GridTime::zero();
}
return t;
}
double TimerArray::getDTimer(const std::string &name)
{
return static_cast<double>(getTimer(name).count());
}
void TimerArray::startCurrentTimer(const std::string &name)
{
if (!name.empty())
{
stopCurrentTimer();
startTimer(name);
currentTimer_ = name;
}
}
void TimerArray::stopTimer(const std::string &name)
{
if (timer_.at(name).isRunning())
{
timer_.at(name).Stop();
}
}
void TimerArray::stopCurrentTimer(void)
{
if (!currentTimer_.empty())
{
stopTimer(currentTimer_);
currentTimer_ = "";
}
}
void TimerArray::stopAllTimers(void)
{
for (auto &t: timer_)
{
stopTimer(t.first);
}
currentTimer_ = "";
}
void TimerArray::resetTimers(void)
{
timer_.clear();
currentTimer_ = "";
}
std::map<std::string, GridTime> TimerArray::getTimings(void)
{
std::map<std::string, GridTime> timing;
for (auto &t: timer_)
{
timing[t.first] = t.second.Elapsed();
}
return timing;
}
Hadrons/TimerArray.hpp (new file, 29 additions)
@@ -0,0 +1,29 @@
#ifndef Hadrons_TimerArray_hpp_
#define Hadrons_TimerArray_hpp_
#include <Hadrons/Global.hpp>
BEGIN_HADRONS_NAMESPACE
class TimerArray
{
public:
TimerArray(void) = default;
virtual ~TimerArray(void) = default;
void startTimer(const std::string &name);
GridTime getTimer(const std::string &name);
double getDTimer(const std::string &name);
void startCurrentTimer(const std::string &name);
void stopTimer(const std::string &name);
void stopCurrentTimer(void);
void stopAllTimers(void);
void resetTimers(void);
std::map<std::string, GridTime> getTimings(void);
private:
std::string currentTimer_;
std::map<std::string, GridStopWatch> timer_;
};
END_HADRONS_NAMESPACE
#endif // Hadrons_TimerArray_hpp_
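A small usage sketch of the new class (illustrative only): timers can now be owned by any object, either by inheritance, as ModuleBase does after this commit, or as a plain member, as A2AMatrixBlockComputation does.

// illustrative sketch, not part of the commit
TimerArray timers;
timers.startTimer("io");
// ... timed section ...
timers.stopTimer("io");
double dt  = timers.getDTimer("io");    // elapsed count as a double
auto   all = timers.getTimings();       // map of timer name -> GridTime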
@@ -64,7 +64,7 @@ modules_cc =\
modules_hpp =\
Modules/MContraction/Baryon.hpp \
Modules/MContraction/A2AMesonField.hpp \
Modules/MContraction/A2AKernels.hpp \
Modules/MContraction/Meson.hpp \
Modules/MContraction/WeakHamiltonian.hpp \
Modules/MContraction/WeakHamiltonianNonEye.hpp \