diff --git a/benchmarks/Grid_comms.cc b/benchmarks/Grid_comms.cc
index 9963a6bf..ef7c5491 100644
--- a/benchmarks/Grid_comms.cc
+++ b/benchmarks/Grid_comms.cc
@@ -25,17 +25,19 @@ int main (int argc, char ** argv)
   for(int lat=4;lat<=16;lat+=4){
     for(int Ls=1;Ls<=16;Ls*=2){
 
+      std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+      GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+
+      int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
       double start=usecond();
-      int ncomm=0;
       for(int i=0;i<Nloop;i++){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-
-      GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 
        std::vector<CommsRequest_t> requests;
        ncomm=0;
@@ -68,11 +70,10 @@ int main (int argc, char ** argv)
          }
        }
-       Grid.SendToRecvFromComplete(requests);
        Grid.Barrier();
-       }
+      }
       double stop=usecond();
 
       double xbytes = Nloop*bytes*2*ncomm;
@@ -96,18 +97,20 @@ int main (int argc, char ** argv)
   for(int lat=4;lat<=16;lat+=4){
     for(int Ls=1;Ls<=16;Ls*=2){
 
+      std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+      GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+
+
+      int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
       double start=usecond();
-      int ncomm=0;
       for(int i=0;i<Nloop;i++){
 
-      std::vector<int> latt_size  ({lat,lat,lat,lat});
-      GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-
-
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-
        ncomm=0;
        for(int mu=0;mu<4;mu++){
@@ -131,7 +134,6 @@ int main (int argc, char ** argv)
          }
 
          comm_proc = mpi_layout[mu]-1;
-
          {
            std::vector<CommsRequest_t> requests;
            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
diff --git a/benchmarks/Grid_memory_bandwidth.cc b/benchmarks/Grid_memory_bandwidth.cc
new file mode 100644
index 00000000..e4e81445
--- /dev/null
+++ b/benchmarks/Grid_memory_bandwidth.cc
@@ -0,0 +1,150 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> simd_layout({1,2,2,2});
+  std::vector<int> mpi_layout ({1,1,1,1});
+
+  const int Nvec=8;
+  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
+
+  int Nloop=100;
+
+  std::cout << "===================================================================================================="<<std::endl;
+
+    std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+    GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+    GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
+
+    LatticeVec z(&Grid); random(pRNG,z);
+    LatticeVec x(&Grid); random(pRNG,x);
+    LatticeVec y(&Grid); random(pRNG,y);
+    double a=1.0;
+
+
+    double start=usecond();
+    for(int i=0;i<Nloop;i++){
+      // inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+      axpy(z,a,x,y);
+    }
+    double stop=usecond();
+    double time = stop-start;
+
+    double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
+    std::cout << lat<<"\t\t"<
+
+    std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+    GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+    GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
+
+    LatticeVec z(&Grid); random(pRNG,z);
+    LatticeVec x(&Grid); random(pRNG,x);
+    LatticeVec y(&Grid); random(pRNG,y);
+    RealD a=1.0;
+
+
+    double start=usecond();
+    for(int i=0;i
+
+    std::vector<int> latt_size  ({lat,lat,lat,lat});
+
+    GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+    GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
+
+    LatticeVec z(&Grid); random(pRNG,z);
+    LatticeVec x(&Grid); random(pRNG,x);
+    LatticeVec y(&Grid); random(pRNG,y);
+    RealD a=1.0;
+    ComplexD nn;
+
+    double start=usecond();
+    for(int i=0;i
 #include
 #include
+#include <valarray>
 #include
 #include
 #include
diff --git a/lib/Grid_lattice.h b/lib/Grid_lattice.h
index c76fa0a9..cf1c30c5 100644
--- a/lib/Grid_lattice.h
+++ b/lib/Grid_lattice.h
@@ -26,7 +26,8 @@ class Lattice
 public:
     GridBase *_grid;
     int checkerboard;
-    std::vector<vobj,alignedAllocator<vobj> > _odata;
+    //std::vector<vobj,alignedAllocator<vobj> > _odata;
+    std::valarray<vobj> _odata;
 public:
     typedef typename vobj::scalar_type scalar_type;
@@ -36,9 +37,9 @@ public:
     // Constructor requires "grid" passed.
    // what about a default grid?
    //////////////////////////////////////////////////////////////////
-    Lattice(GridBase *grid) : _grid(grid) {
+    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
        //        _odata.reserve(_grid->oSites());
-       _odata.resize(_grid->oSites());
+       //        _odata.resize(_grid->oSites());
        assert((((uint64_t)&_odata[0])&0xF) ==0);
        checkerboard=0;
     }
diff --git a/lib/communicator/Grid_communicator_mpi.cc b/lib/communicator/Grid_communicator_mpi.cc
index c76fb76f..6ef05c3d 100644
--- a/lib/communicator/Grid_communicator_mpi.cc
+++ b/lib/communicator/Grid_communicator_mpi.cc
@@ -93,7 +93,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
   MPI_Request rrq;
   int rank = _processor;
   int ierr;
-  ierr=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
   assert(ierr==0);
diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h
index 26ed6a29..4d859173 100644
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -3,6 +3,9 @@
 namespace Grid {
 
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // unary negation
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class vobj>
   inline Lattice<vobj> operator -(const Lattice<vobj> &r)
   {
@@ -13,25 +16,10 @@ namespace Grid {
     }
     return ret;
   }
-
-  template<class vobj>
-  inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-    conformable(lhs,rhs);
-#pragma omp parallel for
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
-    }
-  }
-  template<class vobj>
-  inline void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-    conformable(lhs,rhs);
-#pragma omp parallel for
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
-    }
-  }
-
-
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // avoid copy back routines for mult, mac, sub, add
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
   template<class obj1,class obj2,class obj3>
   void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
     conformable(lhs,rhs);
@@ -69,7 +57,89 @@ namespace Grid {
     }
   }
 
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // avoid copy back routines for mult, mac, sub, add
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class obj1,class obj2,class obj3>
+  void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    conformable(lhs,rhs);
+    uint32_t vec_len = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+  void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    conformable(lhs,rhs);
+    uint32_t vec_len = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+  void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+  void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // avoid copy back routines for mult, mac, sub, add
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class obj1,class obj2,class obj3>
+  void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    uint32_t vec_len = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+  void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    uint32_t vec_len = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+  void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+  void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////
   // Lattice BinOp Lattice,
+  /////////////////////////////////////////////////////////////////////////////////////
   template<class left,class right>
   inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
   {
@@ -156,5 +226,17 @@ namespace Grid {
     }
     return ret;
   }
+
+  template<class sobj,class vobj>
+  inline void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    conformable(lhs,rhs);
+    vobj tmp;
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      tmp = a*lhs._odata[ss];
+      ret._odata[ss]= tmp+rhs._odata[ss];
+    }
+  }
+
 }
 #endif
diff --git a/lib/math/Grid_math_arith_mac.h b/lib/math/Grid_math_arith_mac.h
index 59cb9a5e..6260b98f 100644
--- a/lib/math/Grid_math_arith_mac.h
+++ b/lib/math/Grid_math_arith_mac.h
@@ -7,6 +7,7 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////// MAC         ///////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+
 ///////////////////////////
 ///////////////////////////
 // Legal multiplication table
@@ -74,8 +75,6 @@ inline void mac(iVector * __restrict__ ret,const iVector * __
     }
   return;
 }
-
-
 }
 #endif
diff --git a/lib/math/Grid_math_arith_mul.h b/lib/math/Grid_math_arith_mul.h
index c8bb0b2c..66c8b121 100644
--- a/lib/math/Grid_math_arith_mul.h
+++ b/lib/math/Grid_math_arith_mul.h
@@ -7,7 +7,6 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////// MUL         ///////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-
 template<class rtype,class vtype,class mtype>
 inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){
diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h
index b08d0c83..25da7954 100644
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -16,7 +16,7 @@ namespace Grid {
 //   However note that doing this eliminates some syntactical sugar such as
 //   calling the constructor explicitly or implicitly
 //
-#define TENSOR_IS_POD
+#undef TENSOR_IS_POD
 
 template<class vtype> class iScalar
 {
@@ -36,7 +36,7 @@ public:
   //  template using tensor_reduce_level = typename iScalar::tensor_reduce_level >;
 
 #ifndef TENSOR_IS_POD
-  iScalar(){;};
+  iScalar()=default;
   iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
   iScalar(const Zero &z){ *this = zero; };
 #endif
@@ -126,7 +126,7 @@ public:
 
 #ifndef TENSOR_IS_POD
   iVector(const Zero &z){ *this = zero; };
-  iVector() {};// Empty constructure
+  iVector() =default;
 #endif
 
   iVector<vtype,N> & operator= (const Zero &hero){
@@ -189,7 +189,7 @@ public:
 
 #ifndef TENSOR_IS_POD
   iMatrix(const Zero &z){ *this = zero; };
-  iMatrix() {};
+  iMatrix() =default;
 #endif
 
   iMatrix<vtype,N> & operator= (const Zero &hero){
diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h
index 337b82c7..208e2640 100644
--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -13,7 +13,7 @@ namespace Grid {
       vzero(*this);
       return (*this);
     }
-    vComplexD(){};
+    vComplexD()=default;
     vComplexD(ComplexD a){
       vsplat(*this,a);
     };
diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h
index c42e4029..9c7922e3 100644
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -28,7 +28,7 @@ namespace Grid {
       vzero(*this);
       return (*this);
     }
-    vComplexF(){};
+    vComplexF()=default;
     vComplexF(ComplexF a){
       vsplat(*this,a);
     };
diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h
index 36e189e1..0bdaa5e4 100644
--- a/lib/simd/Grid_vRealD.h
+++ b/lib/simd/Grid_vRealD.h
@@ -10,10 +10,13 @@ namespace Grid {
     typedef dvec  vector_type;
     typedef RealD scalar_type;
 
-    vRealD(){};
+    vRealD()=default;
     vRealD(RealD a){
       vsplat(*this,a);
     };
+    vRealD(Zero &zero){
+      zeroit(*this);
+    }
 
     friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
     friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h
index 70f76bc0..e8d89cec 100644
--- a/lib/simd/Grid_vRealF.h
+++ b/lib/simd/Grid_vRealF.h
@@ -8,14 +8,16 @@ namespace Grid {
     fvec v;
 public:
-
     typedef fvec  vector_type;
     typedef RealF scalar_type;
 
-    vRealF(){};
+    vRealF()=default;
     vRealF(RealF a){
       vsplat(*this,a);
     };
+    vRealF(Zero &zero){
+      zeroit(*this);
+    }
 
     ////////////////////////////////////
     // Arithmetic operator overloads +,-,*
     ////////////////////////////////////
diff --git a/tests/Grid_gamma.cc b/tests/Grid_gamma.cc
index ab6307f3..af735856 100644
--- a/tests/Grid_gamma.cc
+++ b/tests/Grid_gamma.cc
@@ -5,11 +5,10 @@
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
-template<class d>
-struct scal {
-  d internal;
-};
-
+//template class is_pod< iScalar >
+//{
+//
+//};
 
 int main (int argc, char ** argv)
 {
@@ -40,13 +39,16 @@ int main (int argc, char ** argv)
" Is pod " << std::is_pod::value << std::endl; std::cout << " Is pod double " << std::is_pod::value << std::endl; std::cout << " Is pod ComplexF " << std::is_pod::value << std::endl; - std::cout << " Is pod scal " << std::is_pod >::value << std::endl; + std::cout << " Is triv double " << std::is_trivially_default_constructible::value << std::endl; + std::cout << " Is triv ComplexF " << std::is_trivially_default_constructible::value << std::endl; std::cout << " Is pod Scalar " << std::is_pod >::value << std::endl; std::cout << " Is pod Scalar " << std::is_pod >::value << std::endl; std::cout << " Is pod Scalar " << std::is_pod >::value << std::endl; std::cout << " Is pod Scalar " << std::is_pod >::value << std::endl; std::cout << " Is pod Scalar " << std::is_pod >::value << std::endl; std::cout << " Is pod Scalar " << std::is_pod >::value << std::endl; + std::cout << " Is triv Scalar " < >::value << std::endl; + std::cout << " Is triv Scalar "< >::value << std::endl; for(int a=0;a