mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Compare commits
6 Commits
66a1b63aa9
...
b7c7000d0d
Author | SHA1 | Date | |
---|---|---|---|
|
b7c7000d0d | ||
|
551f6c4edd | ||
|
defd814750 | ||
|
3d517bbd2a | ||
|
78ab955fec | ||
|
dd13937bb6 |
@ -204,13 +204,12 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
synchronise();
|
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void gemmBatched(int m,int n, int k,
|
void gemmBatched(int m,int n, int k,
|
||||||
@ -279,13 +278,12 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
synchronise();
|
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
@ -357,13 +355,12 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
synchronise();
|
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 2.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -452,13 +449,12 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
synchronise();
|
|
||||||
RealD t1=usecond();
|
RealD t1=usecond();
|
||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 2.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
// std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,6 +50,7 @@ public:
|
|||||||
typedef iVector<CComplex,nbasis > Cvec;
|
typedef iVector<CComplex,nbasis > Cvec;
|
||||||
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
||||||
typedef Lattice<Fobj > FineField;
|
typedef Lattice<Fobj > FineField;
|
||||||
|
typedef Lattice<CComplex > FineComplexField;
|
||||||
typedef CoarseVector Field;
|
typedef CoarseVector Field;
|
||||||
////////////////////
|
////////////////////
|
||||||
// Data members
|
// Data members
|
||||||
@ -308,6 +309,7 @@ public:
|
|||||||
RealD teigen=0.0;
|
RealD teigen=0.0;
|
||||||
RealD tmat=0.0;
|
RealD tmat=0.0;
|
||||||
RealD tphase=0.0;
|
RealD tphase=0.0;
|
||||||
|
RealD tphaseBZ=0.0;
|
||||||
RealD tinv=0.0;
|
RealD tinv=0.0;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
@ -362,28 +364,41 @@ public:
|
|||||||
///////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////
|
||||||
FineField phaV(grid); // Phased block basis vector
|
FineField phaV(grid); // Phased block basis vector
|
||||||
FineField MphaV(grid);// Matrix applied
|
FineField MphaV(grid);// Matrix applied
|
||||||
|
std::vector<FineComplexField> phaF(npoint,grid);
|
||||||
|
std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
|
||||||
|
|
||||||
CoarseVector coarseInner(CoarseGrid());
|
CoarseVector coarseInner(CoarseGrid());
|
||||||
|
|
||||||
|
typedef typename CComplex::scalar_type SComplex;
|
||||||
|
FineComplexField one(grid); one=SComplex(1.0);
|
||||||
|
FineComplexField zz(grid); zz = Zero();
|
||||||
|
tphase=-usecond();
|
||||||
|
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
// Stick a phase on every block
|
||||||
|
/////////////////////////////////////////////////////
|
||||||
|
CoarseComplexField coor(CoarseGrid());
|
||||||
|
pha[p]=Zero();
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
LatticeCoordinate(coor,mu);
|
||||||
|
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
|
||||||
|
pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
|
||||||
|
}
|
||||||
|
pha[p] =exp(pha[p]*ci);
|
||||||
|
|
||||||
|
blockZAXPY(phaF[p],pha[p],one,zz);
|
||||||
|
|
||||||
|
}
|
||||||
|
tphase+=usecond();
|
||||||
|
|
||||||
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
|
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
|
||||||
std::vector<CoarseVector> FT(npoint,CoarseGrid());
|
std::vector<CoarseVector> FT(npoint,CoarseGrid());
|
||||||
for(int i=0;i<nbasis;i++){// Loop over basis vectors
|
for(int i=0;i<nbasis;i++){// Loop over basis vectors
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
||||||
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||||
/////////////////////////////////////////////////////
|
tphaseBZ-=usecond();
|
||||||
// Stick a phase on every block
|
phaV = phaF[p]*Subspace.subspace[i];
|
||||||
/////////////////////////////////////////////////////
|
tphaseBZ+=usecond();
|
||||||
tphase-=usecond();
|
|
||||||
CoarseComplexField coor(CoarseGrid());
|
|
||||||
CoarseComplexField pha(CoarseGrid()); pha=Zero();
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
LatticeCoordinate(coor,mu);
|
|
||||||
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
|
|
||||||
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
|
|
||||||
}
|
|
||||||
pha =exp(pha*ci);
|
|
||||||
phaV=Zero();
|
|
||||||
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
|
|
||||||
tphase+=usecond();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
// Multiple phased subspace vector by matrix and project to subspace
|
// Multiple phased subspace vector by matrix and project to subspace
|
||||||
@ -394,8 +409,8 @@ public:
|
|||||||
tmat+=usecond();
|
tmat+=usecond();
|
||||||
|
|
||||||
tproj-=usecond();
|
tproj-=usecond();
|
||||||
blockProject(coarseInner,MphaV,Subspace.subspace);
|
blockProjectFast(coarseInner,MphaV,Subspace.subspace);
|
||||||
coarseInner = conjugate(pha) * coarseInner;
|
coarseInner = conjugate(pha[p]) * coarseInner;
|
||||||
|
|
||||||
ComputeProj[p] = coarseInner;
|
ComputeProj[p] = coarseInner;
|
||||||
tproj+=usecond();
|
tproj+=usecond();
|
||||||
@ -431,6 +446,7 @@ public:
|
|||||||
ExchangeCoarseLinks();
|
ExchangeCoarseLinks();
|
||||||
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
|
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
||||||
|
@ -368,6 +368,7 @@ public:
|
|||||||
ComplexD(c),
|
ComplexD(c),
|
||||||
BLAS_CP);
|
BLAS_CP);
|
||||||
}
|
}
|
||||||
|
BLAS.synchronise();
|
||||||
t_mult+=usecond();
|
t_mult+=usecond();
|
||||||
// std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid "<<std::endl;
|
// std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid "<<std::endl;
|
||||||
t_BtoG=-usecond();
|
t_BtoG=-usecond();
|
||||||
|
@ -301,6 +301,37 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
|
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||||
|
inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
|
const Lattice<vobj> &fineData,
|
||||||
|
const VLattice &Basis)
|
||||||
|
{
|
||||||
|
GridBase * fine = fineData.Grid();
|
||||||
|
GridBase * coarse= coarseData.Grid();
|
||||||
|
|
||||||
|
Lattice<iScalar<CComplex>> ip(coarse);
|
||||||
|
Lattice<vobj> fineDataRed = fineData;
|
||||||
|
|
||||||
|
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||||
|
autoView( ip_ , ip, AcceleratorWrite);
|
||||||
|
RealD t_IP=0;
|
||||||
|
RealD t_co=0;
|
||||||
|
for(int v=0;v<nbasis;v++) {
|
||||||
|
t_IP-=usecond();
|
||||||
|
blockInnerProductD(ip,Basis[v],fineData); // ip = <basis|fine>
|
||||||
|
t_IP+=usecond();
|
||||||
|
t_co-=usecond();
|
||||||
|
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||||
|
convertType(coarseData_[sc](v),ip_[sc]);
|
||||||
|
});
|
||||||
|
t_co+=usecond();
|
||||||
|
}
|
||||||
|
// std::cout << GridLogPerformance << " blockProjectFast : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
|
||||||
|
// std::cout << GridLogPerformance << " blockProjectFast : conv : "<<t_co<<" us"<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// This only minimises data motion from CPU to GPU
|
// This only minimises data motion from CPU to GPU
|
||||||
// there is chance of better implementation that does a vxk loop of inner products to data share
|
// there is chance of better implementation that does a vxk loop of inner products to data share
|
||||||
// at the GPU thread level
|
// at the GPU thread level
|
||||||
|
@ -62,6 +62,8 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
|||||||
{
|
{
|
||||||
const int Nsimd=vobj::Nsimd();
|
const int Nsimd=vobj::Nsimd();
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
GridBase *grid = lat.Grid();
|
GridBase *grid = lat.Grid();
|
||||||
Coordinate simd = grid->_simd_layout;
|
Coordinate simd = grid->_simd_layout;
|
||||||
@ -124,8 +126,16 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
|||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Transfer into lattice - will coalesce
|
// Transfer into lattice - will coalesce
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
sobj obj = extractLane(blane,buf_p[ss+offset]);
|
// sobj obj = extractLane(blane,buf_p[ss+offset]);
|
||||||
insertLane(lane,lat_v[osite],obj);
|
// insertLane(lane,lat_v[osite],obj);
|
||||||
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
vector_type * from = (vector_type *)&buf_p[ss+offset];
|
||||||
|
vector_type * to = (vector_type *)&lat_v[osite];
|
||||||
|
scalar_type stmp;
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
stmp = getlane(from[w], blane);
|
||||||
|
putlane(to[w], stmp, lane);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -138,6 +148,8 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
|||||||
{
|
{
|
||||||
const int Nsimd=vobj::Nsimd();
|
const int Nsimd=vobj::Nsimd();
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
autoView(lat_v, lat, AcceleratorRead);
|
autoView(lat_v, lat, AcceleratorRead);
|
||||||
|
|
||||||
@ -200,23 +212,18 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
|||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Take out of lattice
|
// Take out of lattice
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
// sobj obj = extractLane(lane,lat_v[osite]);
|
||||||
sobj obj = extractLane(lane,lat_v[osite]);
|
// insertLane(blane,buf_p[ss+offset],obj);
|
||||||
insertLane(blane,buf_p[ss+offset],obj);
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
vector_type * to = (vector_type *)&buf_p[ss+offset];
|
||||||
|
vector_type * from = (vector_type *)&lat_v[osite];
|
||||||
|
scalar_type stmp;
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
stmp = getlane(from[w], lane);
|
||||||
|
putlane(to[w], stmp, blane);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
/*
|
|
||||||
int words =block*nblock/simd[dim];
|
|
||||||
std::vector<vobj> tbuf(words);
|
|
||||||
acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj));
|
|
||||||
typedef typename vobj::scalar_type scalar;
|
|
||||||
scalar *sbuf = (scalar *)&tbuf[0];
|
|
||||||
scalar tmp=0.0;
|
|
||||||
for(int w=0;w<words*sizeof(vobj)/sizeof(scalar);w++){
|
|
||||||
tmp=tmp+conjugate(sbuf[w])*sbuf[w];
|
|
||||||
}
|
|
||||||
std::cout << " Gathered buffer norm "<<tmp<<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -545,14 +552,15 @@ public:
|
|||||||
t_scatter+= usecond() - t;
|
t_scatter+= usecond() - t;
|
||||||
t_tot+=usecond();
|
t_tot+=usecond();
|
||||||
|
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
||||||
std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes :" << depth*bytes/1e6 << "MB"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user