
Comms and memory benchmarks added

Peter Boyle 2015-05-03 09:44:47 +01:00
parent 99a1ff423d
commit 193860dbc8
14 changed files with 300 additions and 59 deletions

View File

@@ -25,17 +25,19 @@ int main (int argc, char ** argv)
for(int lat=4;lat<=16;lat+=4){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
int ncomm=0;
for(int i=0;i<Nloop;i++){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<CartesianCommunicator::CommsRequest_t> requests;
ncomm=0;
@@ -68,11 +70,10 @@ int main (int argc, char ** argv)
}
}
Grid.SendToRecvFromComplete(requests);
Grid.Barrier();
}
}
double stop=usecond();
double xbytes = Nloop*bytes*2*ncomm;
@@ -96,18 +97,20 @@ int main (int argc, char ** argv)
for(int lat=4;lat<=16;lat+=4){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
int ncomm=0;
for(int i=0;i<Nloop;i++){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
ncomm=0;
for(int mu=0;mu<4;mu++){
@@ -131,7 +134,6 @@ int main (int argc, char ** argv)
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CartesianCommunicator::CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
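A minimal, self-contained sketch (not part of the commit) of the bandwidth bookkeeping this benchmark uses: time Nloop iterations with a microsecond clock, count the bytes exchanged, and report bytes per microsecond, which equals MB/s. The helper my_usecond and the elided transfer step are stand-ins for Grid's usecond() and SendToRecvFrom/SendToRecvFromComplete; the 2*ncomm factor mirrors the xbytes formula above.

// Sketch only -- not the commit's code. my_usecond() stands in for usecond();
// the actual nonblocking sends/receives are elided.
#include <chrono>
#include <cstdio>

static double my_usecond() {
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  const int  Nloop          = 100;
  const long bytes_per_xfer = 1 << 20;   // pretend each exchange moves 1 MiB
  int ncomm = 0;

  double start = my_usecond();
  for (int i = 0; i < Nloop; i++) {
    ncomm = 8;                           // e.g. forward and backward in four directions
    // ... post nonblocking sends/receives here and wait for completion ...
  }
  double stop = my_usecond();

  double time   = stop - start;                                // microseconds
  double xbytes = double(Nloop) * bytes_per_xfer * 2 * ncomm;  // bytes sent plus received
  std::printf("%.1f MB/s\n", xbytes / time);                   // bytes/usec == MB/s
  return 0;
}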

View File

@@ -0,0 +1,150 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> simd_layout({1,2,2,2});
std::vector<int> mpi_layout ({1,1,1,1});
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
int Nloop=100;
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking AXPY bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
double a=1.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
// z=a*x+y;
// inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
axpy(z,a,x,y);
}
double stop=usecond();
double time = stop-start;
double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
double a=1.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x+y;
}
double stop=usecond();
double time = stop-start;
double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking COPY bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
RealD a=1.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
x=z;
}
double stop=usecond();
double time = stop-start;
double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking READ bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
RealD a=1.0;
ComplexD nn;
double start=usecond();
for(int i=0;i<Nloop;i++){
nn=norm2(x);
}
double stop=usecond();
double time = stop-start;
double bytes=lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
Grid_finalize();
}
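The new benchmark file above converts each timed loop to MB/s as bytes divided by elapsed microseconds, with a per-kernel traffic factor (3x for axpy and a*x+y, 2x for copy, 1x for norm2). For comparison only, not part of the commit, the same AXPY measurement over plain std::vector<double> looks like this:

// Sketch only -- a plain-C++ analogue of the AXPY bandwidth loop above.
#include <chrono>
#include <cstdio>
#include <vector>

int main() {
  const int    Nloop = 100;
  const size_t N     = 1 << 22;                      // doubles per vector
  std::vector<double> x(N, 1.0), y(N, 2.0), z(N, 0.0);
  const double a = 1.0;

  auto t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < Nloop; i++)
    for (size_t s = 0; s < N; s++)
      z[s] = a * x[s] + y[s];
  auto t1 = std::chrono::steady_clock::now();

  double usec  = std::chrono::duration<double, std::micro>(t1 - t0).count();
  double bytes = 3.0 * N * sizeof(double) * Nloop;   // read x, read y, write z
  std::printf("%zu doubles per vector: %.1f MB/s\n", N, bytes / usec);
  return 0;
}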

View File

@@ -14,6 +14,7 @@
#include <complex>
#include <vector>
#include <valarray>
#include <iostream>
#include <cassert>
#include <random>

View File

@@ -26,7 +26,8 @@ class Lattice
public:
GridBase *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
//std::vector<vobj,alignedAllocator<vobj> > _odata;
std::valarray<vobj> _odata;
public:
typedef typename vobj::scalar_type scalar_type;
@@ -36,9 +37,9 @@ public:
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _grid(grid) {
Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
// _odata.reserve(_grid->oSites());
_odata.resize(_grid->oSites());
// _odata.resize(_grid->oSites());
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
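A note on the hunk above: _odata is now a std::valarray sized in the constructor's initializer list, and the assert checks that its first element sits on a 16-byte boundary (low four address bits zero). std::valarray makes no promise about over-alignment beyond what the global allocator provides, so the assert relies on that allocator returning 16-byte-aligned storage, which common 64-bit platforms do. A standalone reproduction of the check, with a hypothetical ToyVobj in place of the real vector object:

// Sketch only -- reproduces the 16-byte alignment check on a std::valarray.
#include <cassert>
#include <cstdint>
#include <valarray>

struct ToyVobj { double v[4]; };                 // stand-in for a SIMD vector object

int main() {
  std::valarray<ToyVobj> odata(1024);            // sized up front, like _odata(_grid->oSites())
  assert((reinterpret_cast<std::uintptr_t>(&odata[0]) & 0xF) == 0);  // 16-byte aligned?
  return 0;
}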

View File

@@ -93,7 +93,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);

View File

@@ -3,6 +3,9 @@
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
@@ -13,25 +16,10 @@ namespace Grid {
}
return ret;
}
template<class vobj>
inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class vobj>
inline void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
@@ -69,7 +57,89 @@ namespace Grid {
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mult(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mac(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
add(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
@@ -156,5 +226,17 @@ namespace Grid {
}
return ret;
}
template<class sobj,class vobj>
inline void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
vobj tmp;
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
tmp = a*lhs._odata[ss];
ret._odata[ss]= tmp+rhs._odata[ss];
}
}
}
#endif
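The templated axpy added at the end of this file forms a*lhs into a temporary and then adds rhs, site by site, under an OpenMP parallel for. Below is a reduced illustration of the same pattern over raw doubles (not the commit's code; axpy_sketch is a made-up name). Declaring the temporary inside the loop body keeps it private to each thread.

// Sketch only -- the per-site axpy pattern over raw doubles.
#include <cstdio>
#include <vector>

void axpy_sketch(std::vector<double> &ret, double a,
                 const std::vector<double> &lhs, const std::vector<double> &rhs) {
  #pragma omp parallel for
  for (long ss = 0; ss < (long)lhs.size(); ss++) {
    double tmp = a * lhs[ss];      // temporary declared in the loop: thread-private
    ret[ss]    = tmp + rhs[ss];
  }
}

int main() {
  std::vector<double> x(16, 2.0), y(16, 1.0), z(16, 0.0);
  axpy_sketch(z, 3.0, x, y);
  std::printf("z[0] = %g\n", z[0]);  // prints 7
  return 0;
}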

View File

@@ -7,6 +7,7 @@ namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////// MAC ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////
///////////////////////////
// Legal multiplication table
@@ -74,8 +75,6 @@ inline void mac(iVector<rrtype,N> * __restrict__ ret,const iVector<ltype,N> * __
}
return;
}
}
#endif

View File

@@ -7,7 +7,6 @@ namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////// MUL ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class rtype,class vtype,class mtype>
inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){

View File

@@ -16,7 +16,7 @@ namespace Grid {
// However note that doing this eliminates some syntactical sugar such as
// calling the constructor explicitly or implicitly
//
#define TENSOR_IS_POD
#undef TENSOR_IS_POD
template<class vtype> class iScalar
{
@@ -36,7 +36,7 @@ public:
// template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
#ifndef TENSOR_IS_POD
iScalar(){;};
iScalar()=default;
iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
iScalar(const Zero &z){ *this = zero; };
#endif
@@ -126,7 +126,7 @@ public:
#ifndef TENSOR_IS_POD
iVector(const Zero &z){ *this = zero; };
iVector() {};// Empty constructure
iVector() =default;
#endif
iVector<vtype,N> & operator= (const Zero &hero){
@@ -189,7 +189,7 @@ public:
#ifndef TENSOR_IS_POD
iMatrix(const Zero &z){ *this = zero; };
iMatrix() {};
iMatrix() =default;
#endif
iMatrix<vtype,N> & operator= (const Zero &hero){
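The change from empty user-provided constructors (iScalar(){;}, iVector(){}, iMatrix(){}) to =default in the #ifndef TENSOR_IS_POD branch matters for the type traits probed in the test file further down: a user-provided default constructor, even an empty one, makes a type non-trivially-default-constructible and hence non-POD, while a defaulted one preserves triviality when the members allow it. A self-contained check, using hypothetical wrapper names:

// Sketch only -- shows why "= default" keeps a wrapper trivially default
// constructible while an empty user-provided constructor does not.
#include <type_traits>

template <class T> struct WrapUserCtor { T v; WrapUserCtor() {} };           // user-provided
template <class T> struct WrapDefault  { T v; WrapDefault() = default; };    // defaulted

static_assert(!std::is_trivially_default_constructible<WrapUserCtor<double>>::value,
              "empty user-provided constructor removes triviality");
static_assert( std::is_trivially_default_constructible<WrapDefault<double>>::value,
              "defaulted constructor preserves triviality");

int main() { return 0; }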

View File

@@ -13,7 +13,7 @@ namespace Grid {
vzero(*this);
return (*this);
}
vComplexD(){};
vComplexD()=default;
vComplexD(ComplexD a){
vsplat(*this,a);
};

View File

@@ -28,7 +28,7 @@ namespace Grid {
vzero(*this);
return (*this);
}
vComplexF(){};
vComplexF()=default;
vComplexF(ComplexF a){
vsplat(*this,a);
};

View File

@@ -10,10 +10,13 @@ namespace Grid {
typedef dvec vector_type;
typedef RealD scalar_type;
vRealD(){};
vRealD()=default;
vRealD(RealD a){
vsplat(*this,a);
};
vRealD(Zero &zero){
zeroit(*this);
}
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}

View File

@@ -8,14 +8,16 @@ namespace Grid {
fvec v;
public:
typedef fvec vector_type;
typedef RealF scalar_type;
vRealF(){};
vRealF()=default;
vRealF(RealF a){
vsplat(*this,a);
};
vRealF(Zero &zero){
zeroit(*this);
}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
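The vRealD and vRealF hunks add a constructor taking Grid's Zero tag, so a zeroed SIMD vector can be constructed directly from the global zero object. The same tag-constructor pattern, reduced to a toy four-lane type (ToyVec4 and the local Zero here are illustrative, not Grid's types):

// Sketch only -- the "Zero tag" constructor pattern on a toy vector type.
#include <cstdio>

struct Zero {};            // empty tag type, standing in for Grid's Zero
static Zero zero;          // a tag instance, as Grid exposes "zero"

struct ToyVec4 {
  double lane[4];
  ToyVec4() = default;                                     // trivial default ctor
  ToyVec4(Zero &) { for (double &l : lane) l = 0.0; }      // zero-initialising tag ctor
};

int main() {
  ToyVec4 v(zero);
  std::printf("%g %g %g %g\n", v.lane[0], v.lane[1], v.lane[2], v.lane[3]);
  return 0;
}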

View File

@@ -5,11 +5,10 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
//template<class vobj> class is_pod< iScalar<vobj> >
//{
//
//};
int main (int argc, char ** argv)
{
@@ -40,13 +39,16 @@ int main (int argc, char ** argv)
std::cout << " Is pod " << std::is_pod<SpinVector>::value << std::endl;
std::cout << " Is pod double " << std::is_pod<double>::value << std::endl;
std::cout << " Is pod ComplexF " << std::is_pod<ComplexF>::value << std::endl;
std::cout << " Is pod scal<double> " << std::is_pod<scal<double> >::value << std::endl;
std::cout << " Is triv double " << std::is_trivially_default_constructible<double>::value << std::endl;
std::cout << " Is triv ComplexF " << std::is_trivially_default_constructible<ComplexF>::value << std::endl;
std::cout << " Is pod Scalar<double> " << std::is_pod<iScalar<double> >::value << std::endl;
std::cout << " Is pod Scalar<ComplexF> " << std::is_pod<iScalar<ComplexF> >::value << std::endl;
std::cout << " Is pod Scalar<vComplexF> " << std::is_pod<iScalar<vComplexF> >::value << std::endl;
std::cout << " Is pod Scalar<vComplexD> " << std::is_pod<iScalar<vComplexD> >::value << std::endl;
std::cout << " Is pod Scalar<vRealF> " << std::is_pod<iScalar<vRealF> >::value << std::endl;
std::cout << " Is pod Scalar<vRealD> " << std::is_pod<iScalar<vRealD> >::value << std::endl;
std::cout << " Is triv Scalar<double> " <<std::is_trivially_default_constructible<iScalar<double> >::value << std::endl;
std::cout << " Is triv Scalar<vComplexD> "<<std::is_trivially_default_constructible<iScalar<vComplexD> >::value << std::endl;
for(int a=0;a<Ns;a++){
ident()(a,a) = 1.0;