mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Hadrons: A2A block performance counter fix
This commit is contained in:
parent
58567fc650
commit
f6593dc881
@ -148,7 +148,7 @@ A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
|
|||||||
|
|
||||||
#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
|
#define START_TIMER(name) if (tArray_) tArray_->startTimer(name)
|
||||||
#define STOP_TIMER(name) if (tArray_) tArray_->stopTimer(name)
|
#define STOP_TIMER(name) if (tArray_) tArray_->stopTimer(name)
|
||||||
#define GET_TIMER(name) (tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.
|
#define GET_TIMER(name) ((tArray_ != nullptr) ? tArray_->getDTimer(name) : 0.)
|
||||||
|
|
||||||
template <typename T, typename Field, typename MetadataType, typename TIo>
|
template <typename T, typename Field, typename MetadataType, typename TIo>
|
||||||
void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
|
void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
|
||||||
@ -190,11 +190,13 @@ void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
|
|||||||
A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
|
A2AMatrixSet<TIo> mBlock(mBuf_.data(), next_, nstr_, nt_, N_ii, N_jj);
|
||||||
|
|
||||||
// Series of cache blocked chunks of the contractions within this block
|
// Series of cache blocked chunks of the contractions within this block
|
||||||
flops = 0.0;
|
flops = 0.0;
|
||||||
bytes = 0.0;
|
bytes = 0.0;
|
||||||
|
t_kernel = 0.0;
|
||||||
for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
|
for(int ii=0;ii<N_ii;ii+=cacheBlockSize_)
|
||||||
for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
|
for(int jj=0;jj<N_jj;jj+=cacheBlockSize_)
|
||||||
{
|
{
|
||||||
|
double t;
|
||||||
int N_iii = MIN(N_ii-ii,cacheBlockSize_);
|
int N_iii = MIN(N_ii-ii,cacheBlockSize_);
|
||||||
int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
|
int N_jjj = MIN(N_jj-jj,cacheBlockSize_);
|
||||||
A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
|
A2AMatrixSet<T> mCacheBlock(mCache_.data(), next_, nstr_, nt_, N_iii, N_jjj);
|
||||||
@ -202,9 +204,9 @@ void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
|
|||||||
// makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph,
|
// makeMesonFieldBlock(mfCacheBlock, &w[i+ii], &v[j+jj], gamma_, ph,
|
||||||
// env().getNd() - 1, this);
|
// env().getNd() - 1, this);
|
||||||
START_TIMER("kernel");
|
START_TIMER("kernel");
|
||||||
kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, tot_kernel);
|
kernel(mCacheBlock, &left[i+ii], &right[j+jj], orthogDim_, t);
|
||||||
STOP_TIMER("kernel");
|
STOP_TIMER("kernel");
|
||||||
|
t_kernel += t;
|
||||||
// flops for general N_c & N_s
|
// flops for general N_c & N_s
|
||||||
// flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
|
// flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj*ngamma;
|
||||||
// bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
|
// bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj
|
||||||
@ -227,12 +229,10 @@ void A2AMatrixBlockComputation<T, Field, MetadataType, TIo>
|
|||||||
// perf
|
// perf
|
||||||
// tot_kernel = getDTimer("contraction: colour trace & mom.")
|
// tot_kernel = getDTimer("contraction: colour trace & mom.")
|
||||||
// + getDTimer("contraction: local space sum");
|
// + getDTimer("contraction: local space sum");
|
||||||
t_kernel = tot_kernel - t_kernel;
|
|
||||||
LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes
|
LOG(Message) << "Kernel perf " << flops/t_kernel/1.0e3/nodes
|
||||||
<< " Gflop/s/node " << std::endl;
|
<< " Gflop/s/node " << std::endl;
|
||||||
LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes
|
LOG(Message) << "Kernel perf " << bytes/t_kernel*1.0e6/1024/1024/1024/nodes
|
||||||
<< " GB/s/node " << std::endl;
|
<< " GB/s/node " << std::endl;
|
||||||
t_kernel = tot_kernel;
|
|
||||||
|
|
||||||
// IO
|
// IO
|
||||||
double blockSize, ioTime;
|
double blockSize, ioTime;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user