Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-13 01:05:36 +00:00)

Commit 07c0c02f8c: Speed up Cshift
Parent: 8c31c065b5
@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
 
-
 NAMESPACE_BEGIN(Grid);
 
 /*Move control to configure.ac and Config.h*/
@@ -157,6 +156,15 @@ public:
 
     assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
 
+#if 0
+    size_type page_size=4096;
+    size_type pages = (bytes+page_size-1)/page_size;
+    uint8_t *bp = (uint8_t *)ptr;
+
+    accelerator_for(pg,pages,1,{
+      bp[pg*page_size]=0;
+    });
+#endif
     return ptr;
   }
 
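The disabled block above sketches a "first touch" pass: one byte per 4 KiB page is written from the device (via accelerator_for) so unified-memory pages start out resident on the GPU. It is left behind #if 0 in this commit. A minimal host-side sketch of the same idea, with an assumed plain byte buffer (illustrative only, not the Grid allocator):

#include <cstddef>
#include <cstdint>

// Touch one byte per page so the OS/driver places the pages at allocation time
// rather than on first use inside a timed kernel.
void first_touch(void *ptr, std::size_t bytes) {
  const std::size_t page_size = 4096;
  const std::size_t pages = (bytes + page_size - 1) / page_size;
  uint8_t *bp = static_cast<uint8_t *>(ptr);
  for (std::size_t pg = 0; pg < pages; pg++) bp[pg * page_size] = 0;
}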
@@ -81,6 +81,7 @@ public:
 
     bool _isCheckerBoarded;
     int LocallyPeriodic;
+    Coordinate _checker_dim_mask;
 
 public:
 
@@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
 
 public:
     int dummy;
+    Coordinate _checker_dim_mask;
     virtual int CheckerBoardFromOindexTable (int Oindex) {
       return 0;
     }
@@ -104,6 +105,7 @@ public:
     _ldimensions.resize(_ndimension);
     _rdimensions.resize(_ndimension);
     _simd_layout.resize(_ndimension);
+    _checker_dim_mask.resize(_ndimension);;
     _lstart.resize(_ndimension);
     _lend.resize(_ndimension);
 
@@ -114,6 +116,8 @@ public:
 
     for (int d = 0; d < _ndimension; d++)
     {
+      _checker_dim_mask[d]=0;
+
       _fdimensions[d] = dimensions[d]; // Global dimensions
       _gdimensions[d] = _fdimensions[d]; // Global dimensions
       _simd_layout[d] = simd_layout[d];
@@ -35,12 +35,28 @@ static const int CbRed =0;
 static const int CbBlack=1;
 static const int Even =CbRed;
 static const int Odd =CbBlack;
 
+accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
+{
+  int nd=rdim.size();
+  Coordinate coor(nd);
+
+  Lexicographic::CoorFromIndex(coor,oindex,rdim);
+
+  int linear=0;
+  for(int d=0;d<nd;d++){
+    if(chk_dim_msk[d])
+      linear=linear+coor[d];
+  }
+  return (linear&0x1);
+}
+
+
 // Specialise this for red black grids storing half the data like a chess board.
 class GridRedBlackCartesian : public GridBase
 {
 public:
-    Coordinate _checker_dim_mask;
+    //    Coordinate _checker_dim_mask;
     int _checker_dim;
     std::vector<int> _checker_board;
 
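The new RedBlackCheckerBoardFromOindex decodes a reduced-grid site index into coordinates and sums the coordinates of the checkerboarded (masked) dimensions mod 2. A standalone sketch of the same parity computation, with hypothetical names (not the Grid API); the decode assumes dimension 0 runs fastest, as in Lexicographic::CoorFromIndex:

#include <cassert>
#include <cstddef>
#include <vector>

int parity_from_index(int oindex, const std::vector<int> &rdim, const std::vector<int> &mask) {
  int linear = 0;
  for (std::size_t d = 0; d < rdim.size(); d++) {
    int coor = oindex % rdim[d];   // decode coordinate d, dimension 0 fastest
    oindex  /= rdim[d];
    if (mask[d]) linear += coor;   // only masked dimensions contribute to parity
  }
  return linear & 0x1;
}

int main() {
  std::vector<int> rdim = {4, 4};   // 4x4 reduced local volume
  std::vector<int> mask = {1, 1};   // both dimensions checkerboarded
  assert(parity_from_index(0, rdim, mask) == 0);  // site (0,0) is even
  assert(parity_from_index(1, rdim, mask) == 1);  // site (1,0) is odd
}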
@@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 NAMESPACE_BEGIN(Grid);
 
+extern Vector<std::pair<int,int> > Cshift_table;
+
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 ///////////////////////////////////////////////////////////////////
@@ -46,7 +48,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
   int e2=rhs.Grid()->_slice_block[dimension];
   int ent = 0;
 
-  static Vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int stride=rhs.Grid()->_slice_stride[dimension];
 
   auto rhs_v = rhs.View();
@@ -55,7 +58,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
       for(int b=0;b<e2;b++){
        int o = n*stride;
        int bo = n*e2;
-       table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
+       Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
       }
     }
   } else {
@@ -65,13 +68,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
        int o = n*stride;
        int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
        if ( ocb &cbmask ) {
-         table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
+         Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
        }
       }
     }
   }
-  thread_for(i,ent,{
-    buffer[table[i].first]=rhs_v[table[i].second];
+  auto buffer_p = & buffer[0];
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
+    buffer_p[table[i].first]=rhs_v[table[i].second];
   });
 }
 
@@ -97,34 +102,36 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 
   auto rhs_v = rhs.View();
   if ( cbmask ==0x3){
-    thread_for_collapse(2,n,e1,{
-      for(int b=0;b<e2;b++){
+    accelerator_for2d(n,e1,b,e2,1,{
 
        int o = n*n1;
        int offset = b+n*e2;
 
        vobj temp =rhs_v[so+o+b];
        extract<vobj>(temp,pointers,offset);
-      }
     });
   } else {
 
-    // Case of SIMD split AND checker dim cannot currently be hit, except in
-    // Test_cshift_red_black code.
-    std::cout << " Dense packed buffer WARNING " <<std::endl;
-    thread_for_collapse(2,n,e1,{
-      for(int b=0;b<e2;b++){
+    Coordinate rdim=rhs.Grid()->_rdimensions;
+    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
+    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
+    accelerator_for2d(n,e1,b,e2,1,{
+
+      Coordinate coor;
+
       int o=n*n1;
-      int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
+      int oindex = o+b;
+
+      int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
+
+      int ocb=1<<cb;
       int offset = b+n*e2;
 
       if ( ocb & cbmask ) {
        vobj temp =rhs_v[so+o+b];
        extract<vobj>(temp,pointers,offset);
       }
-      }
     });
   }
 }
 
@@ -145,7 +152,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride=rhs.Grid()->_slice_stride[dimension];
 
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int ent =0;
 
   if ( cbmask ==0x3 ) {
@@ -154,7 +162,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
       for(int b=0;b<e2;b++){
        int o =n*rhs.Grid()->_slice_stride[dimension];
        int bo =n*rhs.Grid()->_slice_block[dimension];
-       table[ent++] = std::pair<int,int>(so+o+b,bo+b);
+       Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
       }
     }
 
@@ -165,15 +173,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
        int o =n*rhs.Grid()->_slice_stride[dimension];
        int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
        if ( ocb & cbmask ) {
-         table[ent++]=std::pair<int,int> (so+o+b,bo++);
+         Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
        }
       }
     }
   }
 
   auto rhs_v = rhs.View();
-  thread_for(i,ent,{
-    rhs_v[table[i].first]=buffer[table[i].second];
+  auto buffer_p = & buffer[0];
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
+    rhs_v[table[i].first]=buffer_p[table[i].second];
   });
 }
 
@@ -195,13 +205,11 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 
   if(cbmask ==0x3 ) {
     auto rhs_v = rhs.View();
-    thread_for_collapse(2,n,e1,{
-      for(int b=0;b<e2;b++){
+    accelerator_for2d(n,e1,b,e2,1,{
        int o = n*rhs.Grid()->_slice_stride[dimension];
        int offset = b+n*rhs.Grid()->_slice_block[dimension];
        merge(rhs_v[so+o+b],pointers,offset);
-      }
     });
   } else {
 
     // Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -225,6 +233,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
+
 template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -239,14 +248,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int ent=0;
 
   if(cbmask == 0x3 ){
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o =n*stride+b;
-       table[ent++] = std::pair<int,int>(lo+o,ro+o);
+       Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
   } else {
@@ -255,7 +266,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
        int o =n*stride+b;
        int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-         table[ent++] = std::pair<int,int>(lo+o,ro+o);
+         Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
        }
       }
     }
@@ -263,7 +274,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 
   auto rhs_v = rhs.View();
   auto lhs_v = lhs.View();
-  thread_for(i,ent,{
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
     lhs_v[table[i].first]=rhs_v[table[i].second];
   });
 
@@ -271,7 +283,6 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
-
   int rd = rhs.Grid()->_rdimensions[dimension];
 
   if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@@ -285,27 +296,29 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs.Grid()->_slice_block [dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
 
-  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+
   int ent=0;
 
   if ( cbmask == 0x3 ) {
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o =n*stride;
-       table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
       }}
   } else {
     for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
       int o =n*stride;
       int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+      if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
      }}
   }
 
   auto rhs_v = rhs.View();
   auto lhs_v = lhs.View();
-  thread_for(i,ent,{
+  auto table = &Cshift_table[0];
+  accelerator_for(i,ent,1,{
    permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
   });
 }
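The recurring change in these hunks replaces each function-local static table with a single global Cshift_table that only ever grows, and swaps the host-threaded copy loop (thread_for / thread_for_collapse) for accelerator_for / accelerator_for2d, so the (dst,src) index table is built once on the host and the copies run on the device. A standalone host-side sketch of the pattern, with illustrative names (not the Grid API):

#include <cstddef>
#include <utility>
#include <vector>

// One table of (dst,src) index pairs that only grows, filled on the host, then
// applied as a flat copy loop; in the commit that loop is an accelerator_for.
static std::vector<std::pair<int,int>> shift_table;

template <class T>
void apply_table(std::vector<T> &dst, const std::vector<T> &src, std::size_t ent) {
  for (std::size_t i = 0; i < ent; i++)
    dst[shift_table[i].first] = src[shift_table[i].second];
}

int main() {
  std::vector<double> src = {1, 2, 3, 4}, dst(4, 0);
  std::size_t ent = 0;
  if (shift_table.size() < src.size()) shift_table.resize(src.size()); // grow only
  for (int i = 0; i < 4; i++) shift_table[ent++] = {i, 3 - i};         // toy "shift": reversal
  apply_table(dst, src, ent);                                          // dst = {4,3,2,1}
}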
@@ -1,10 +1,186 @@
 #include <Grid/GridCore.h>
 
 NAMESPACE_BEGIN(Grid);
-uint32_t accelerator_threads;
+uint32_t accelerator_threads=8;
 uint32_t acceleratorThreads(void) {return accelerator_threads;};
 void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
-#ifdef GRID_SYCL
-cl::sycl::queue *theGridAccelerator;
+
+#ifdef GRID_CUDA
+cudaDeviceProp *gpu_props;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  cudaGetDeviceCount(&nDevices);
+  gpu_props = new cudaDeviceProp[nDevices];
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+
+      GPU_PROP(managedMemory);
+      GPU_PROP(isMultiGpuBoard);
+      GPU_PROP(warpSize);
+      // GPU_PROP(unifiedAddressing);
+      // GPU_PROP(l2CacheSize);
+      // GPU_PROP(singleToDoublePrecisionPerfRatio);
+    }
+  }
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+#else
+  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
+  cudaSetDevice(rank);
 #endif
+  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
+}
+#endif
+
+#ifdef GRID_HIP
+hipDeviceProp_t *gpu_props;
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  hipGetDeviceCount(&nDevices);
+  gpu_props = new hipDeviceProp_t[nDevices];
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    hipGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      hipDeviceProp_t prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+
+      // GPU_PROP(managedMemory);
+      GPU_PROP(isMultiGpuBoard);
+      GPU_PROP(warpSize);
+      // GPU_PROP(unifiedAddressing);
+      // GPU_PROP(l2CacheSize);
+      // GPU_PROP(singleToDoublePrecisionPerfRatio);
+    }
+  }
+#ifdef GRID_IBM_SUMMIT
+  // IBM Jsrun makes cuda Device numbering screwy and not match rank
+  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
+#else
+  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
+  cudaSetDevice(rank);
+#endif
+  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
+}
+#endif
+
+#ifdef GRID_SYCL
+
+cl::sycl::queue *theGridAccelerator;
+
+void acceleratorInit(void)
+{
+  int nDevices = 1;
+  cl::sycl::gpu_selector selector;
+  cl::sycl::device selectedDevice { selector };
+  theGridAccelerator = new sycl::queue (selectedDevice);
+
+  char * localRankStr = NULL;
+  int rank = 0, world_rank=0;
+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+  // We extract the local rank initialization using an environment variable
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
+  {
+    rank = atoi(localRankStr);
+  }
+  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+
+  if ( world_rank == 0 ) {
+    GridBanner();
+  }
+  /*
+  for (int i = 0; i < nDevices; i++) {
+
+#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
+#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
+
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    if ( world_rank == 0) {
+      cudaDeviceProp prop;
+      prop = gpu_props[i];
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device Number : %d\n", i);
+      printf("GpuInit: ========================\n");
+      printf("GpuInit: Device identifier: %s\n", prop.name);
+    }
+  }
+  */
+  if ( world_rank == 0 ) {
+    printf("GpuInit: ================================================\n");
+  }
+}
+#endif
+
+#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
+void acceleratorInit(void){}
+#endif
+
 NAMESPACE_END(Grid);
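The new acceleratorInit reads the MPI launcher's per-node ("local") rank from the environment before MPI itself is initialised, and uses it to bind each process to a device (except on Summit-like systems, where jsrun already handles the mapping). A standalone sketch of that binding logic, with illustrative names (not the Grid API):

#include <cstdio>
#include <cstdlib>

// Return the node-local rank advertised by common MPI launchers, or 0 if none.
int local_rank_from_env() {
  const char *keys[] = {"OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"};
  for (const char *k : keys) {
    if (const char *v = std::getenv(k)) return std::atoi(v);
  }
  return 0;  // single-process fallback
}

int main() {
  int rank = local_rank_from_env();
  std::printf("would bind this process to device %d (e.g. cudaSetDevice)\n", rank);
}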
@@ -51,6 +51,7 @@ NAMESPACE_BEGIN(Grid);
 //
 // Warp control and info:
 //
+// acceleratorInit;
 // void acceleratorSynchronise(void); // synch warp etc..
 // int acceleratorSIMTlane(int Nsimd);
 //
@@ -69,6 +70,7 @@ NAMESPACE_BEGIN(Grid);
 
 uint32_t acceleratorThreads(void);
 void acceleratorThreads(uint32_t);
+void acceleratorInit(void);
 
 //////////////////////////////////////////////
 // CUDA acceleration
@@ -83,6 +85,32 @@ void acceleratorThreads(uint32_t);
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.x; } // CUDA specific
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
+  { \
+    typedef uint64_t Iterator; \
+    auto lambda = [=] accelerator \
+      (Iterator lane,Iterator iter1,Iterator iter2) mutable { \
+      __VA_ARGS__; \
+    }; \
+    int nt=acceleratorThreads(); \
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
+    dim3 cu_blocks (1,(num1+nt-1)/nt,num2); \
+    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num1,num2,lambda); \
+  }
+
+template<typename lambda> __global__
+void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
+{
+  uint64_t x = threadIdx.x;//+ blockDim.x*blockIdx.x;
+  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
+  uint64_t z = threadIdx.z + blockDim.z*blockIdx.z;
+  if ( (x < num1) && (y<num2) && (z<num3) ) {
+    Lambda(x,y,z);
+  }
+}
+
 #define accelerator_barrier(dummy) \
 { \
   cudaDeviceSynchronize(); \
@@ -91,25 +119,9 @@ void acceleratorThreads(uint32_t);
     printf("Cuda error %s \n", cudaGetErrorString( err )); \
     puts(__FILE__); \
     printf("Line %d\n",__LINE__); \
-    exit(0); \
   } \
 }
 
-#define accelerator_forNB( iterator, num, nsimd, ... ) \
-  { \
-    typedef uint64_t Iterator; \
-    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
-      __VA_ARGS__; \
-    }; \
-    dim3 cu_threads(acceleratorThreads(),nsimd); \
-    dim3 cu_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
-    LambdaApply<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
-  }
-
-#define accelerator_for( iterator, num, nsimd, ... ) \
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
-  accelerator_barrier(dummy);
-
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -133,15 +145,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
 
-template<typename lambda> __global__
-void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
-{
-  uint64_t isite = threadIdx.y;
-  uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
-  if ( (osite <Osites) && (isite<Isites) ) {
-    Lambda(isite,osite);
-  }
-}
-
 #endif
 
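The new CUDA path launches a single three-index LambdaApply kernel: threadIdx.x carries the SIMD lane and the y/z block dimensions carry the two loop indices, with a bounds guard for partially filled blocks. A minimal standalone CUDA sketch of that launch pattern (illustrative names, not Grid's macros; device lambdas in host code assume nvcc with --extended-lambda):

#include <cstdint>
#include <cstdio>

// Generic kernel: x = SIMD lane, y/z = the two loop indices.
template <typename L>
__global__ void lambda_apply(uint64_t nx, uint64_t ny, uint64_t nz, L lambda) {
  uint64_t x = threadIdx.x;
  uint64_t y = threadIdx.y + blockDim.y * blockIdx.y;
  uint64_t z = threadIdx.z + blockDim.z * blockIdx.z;
  if (x < nx && y < ny && z < nz) lambda(x, y, z);
}

int main() {
  const uint64_t nsimd = 4, n1 = 1000, n2 = 3;
  const int nt = 8;                                 // y-threads per block
  dim3 threads(nsimd, nt, 1);
  dim3 blocks(1, (n1 + nt - 1) / nt, n2);
  auto body = [=] __device__(uint64_t lane, uint64_t i, uint64_t j) {
    // loop body goes here; indices map exactly as in accelerator_for2dNB
  };
  lambda_apply<<<blocks, threads>>>(nsimd, n1, n2, body);
  cudaDeviceSynchronize();                          // the accelerator_barrier step
  std::printf("done\n");
}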
@@ -164,25 +167,29 @@ extern cl::sycl::queue *theGridAccelerator;
 #define accelerator
 #define accelerator_inline strong_inline
 
-#define accelerator_forNB(iterator,num,nsimd, ... ) \
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[0]; } // SYCL specific
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
-    cl::sycl::range<3> local {acceleratorThreads(),1,nsimd}; \
-    cl::sycl::range<3> global{(unsigned long)num,1,(unsigned long)nsimd}; \
+    int nt=acceleratorThreads(); \
+    unsigned long unum1 = num1; \
+    unsigned long unum2 = num2; \
+    cl::sycl::range<3> local {nsimd,nt,1}; \
+    cl::sycl::range<3> global{nsimd,unum1,unum2}; \
     cgh.parallel_for<class dslash>( \
      cl::sycl::nd_range<3>(global,local), \
      [=] (cl::sycl::nd_item<3> item) mutable { \
-      auto iterator = item.get_global_id(0); \
-      auto lane = item.get_global_id(2); \
+      auto lane = item.get_global_id(0); \
+      auto iter1 = item.get_global_id(1); \
+      auto iter2 = item.get_global_id(2); \
      { __VA_ARGS__ }; \
     }); \
    });
+    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
+    dim3 cu_blocks (1,(num1+nt-1)/n,num2); \
 
 #define accelerator_barrier(dummy) theGridAccelerator->wait();
 
-#define accelerator_for( iterator, num, nsimd, ... ) \
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
-  accelerator_barrier(dummy);
-
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
@@ -204,33 +211,49 @@ NAMESPACE_BEGIN(Grid);
 
 #define accelerator __host__ __device__
 #define accelerator_inline __host__ __device__ inline
 
+/*These routines define mapping from thread grid to loop & vector lane indexing */
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_x; } // HIP specific
+
+#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
+  { \
+    typedef uint64_t Iterator; \
+    auto lambda = [=] accelerator \
+      (Iterator lane,Iterator iter1,Iterator iter2 ) mutable { \
+      { __VA_ARGS__;} \
+    }; \
+    int nt=acceleratorThreads(); \
+    dim3 hip_threads(nsimd,nt,1); \
+    dim3 hip_blocks (1,(num1+nt-1)/nt,num2); \
+    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
+                       0,0, \
+                       nsimd,num1,num2,lambda); \
+  }
+
+
+template<typename lambda> __global__
+void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
+{
+  uint64_t x = hipThreadIdx_x;//+ hipBlockDim_x*hipBlockIdx_x;
+  uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
+  uint64_t z = hipThreadIdx_z + hipBlockDim_z*hipBlockIdx_z;
+  if ( (x < numx) && (y<numy) && (z<numz) ) {
+    Lambda(x,y,z);
+  }
+}
+
 #define accelerator_barrier(dummy) \
 { \
   hipDeviceSynchronize(); \
   auto err = hipGetLastError(); \
   if ( err != hipSuccess ) { \
-    printf("HIP error %s \n", hipGetErrorString( err )); \
+    printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
     puts(__FILE__); \
     printf("Line %d\n",__LINE__); \
     exit(0); \
   } \
 }
 
-#define accelerator_forNB( iterator, num, nsimd, ... ) \
-  { \
-    typedef uint64_t Iterator; \
-    auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
-      __VA_ARGS__; \
-    }; \
-    dim3 hip_threads(acceleratorThreads(),nsimd); \
-    dim3 hip_blocks ((num+acceleratorThreads()-1)/acceleratorThreads()); \
-    hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,0,0,num,nsimd,lambda);\
-  }
-
-#define accelerator_for( iterator, num, nsimd, ... ) \
-  accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
-  accelerator_barrier(dummy);
-
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -241,6 +264,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   }
   return ptr;
 };
+
 inline void *acceleratorAllocDevice(size_t bytes)
 {
   void *ptr=NULL;
@@ -251,18 +275,25 @@ inline void *acceleratorAllocDevice(size_t bytes)
   }
   return ptr;
 };
 
 inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
 
-template<typename lambda> __global__
-void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
-{
-  uint64_t isite = hipThreadIdx_y;
-  uint64_t osite = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
-  if ( (osite <Osites) && (isite<Isites) ) {
-    Lambda(isite,osite);
-  }
-}
+#endif
+
+//////////////////////////////////////////////
+// Common on all GPU targets
+//////////////////////////////////////////////
+#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
+#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
+
+#define accelerator_for( iter, num, nsimd, ... ) \
+  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
+  accelerator_barrier(dummy);
+
+#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
+  accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
+  accelerator_barrier(dummy);
+
 #endif
 
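With the per-backend macros reduced to accelerator_for2dNB plus accelerator_barrier, the common block above builds everything else by composition: a 1D loop is the 2D loop with a second extent of 1, and the blocking forms are the non-blocking launch followed by the barrier. A host-side model of that layering, with illustrative names (not Grid's macros):

#include <cstdint>
#include <functional>
#include <iostream>

// Non-blocking 2D iteration; on a real backend this would be a kernel launch.
void for2d_nb(uint64_t n1, uint64_t n2,
              const std::function<void(uint64_t, uint64_t)> &body) {
  for (uint64_t i = 0; i < n1; i++)
    for (uint64_t j = 0; j < n2; j++) body(i, j);
}

// 1D blocking loop = 2D loop with unit second extent, then a barrier.
void for1d(uint64_t n, const std::function<void(uint64_t)> &body) {
  for2d_nb(n, 1, [&](uint64_t i, uint64_t) { body(i); });
  // accelerator_barrier() would go here on a device backend
}

int main() {
  for1d(3, [](uint64_t i) { std::cout << "iter " << i << "\n"; });
}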
@@ -280,6 +311,9 @@ void LambdaApply(uint64_t Isites, uint64_t Osites, lambda Lambda)
 #define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
 #define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
 #define accelerator_barrier(dummy)
+#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
+
+accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 
 #ifdef HAVE_MALLOC_MALLOC_H
 #include <malloc/malloc.h>
@@ -303,7 +337,6 @@ inline void acceleratorFreeShared(void *ptr){free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr);};
 #endif
 
-
 #endif // CPU target
 
 ///////////////////////////////////////////////////
@@ -325,25 +358,4 @@ accelerator_inline void acceleratorSynchronise(void)
   return;
 }
 
-////////////////////////////////////////////////////
-// Address subvectors on accelerators
-////////////////////////////////////////////////////
-#ifdef GRID_SIMT
-
-#ifdef GRID_CUDA
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
-#endif
-#ifdef GRID_SYCL
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific
-#endif
-#ifdef GRID_HIP
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_y; } // HIP specific
-#endif
-
-#else
-
-accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-
-#endif
-
 NAMESPACE_END(Grid);
@@ -58,6 +58,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #endif
 
 #define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
+#define thread_for2d( i1, n1,i2,n2, ... ) \
+  DO_PRAGMA(omp parallel for collapse(2)) \
+  for ( uint64_t i1=0;i1<n1;i1++) { \
+  for ( uint64_t i2=0;i2<n2;i2++) { \
+  { __VA_ARGS__ } ; \
+  }}
 #define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
 #define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
 #define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
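On the CPU target, accelerator_for2d falls back to the new thread_for2d, i.e. an OpenMP collapse(2) loop over both indices. A self-contained sketch of what that expansion looks like written out by hand (illustrative, not the macro itself):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t n1 = 4, n2 = 8;
  std::vector<double> plane(n1 * n2, 0.0);
  // Both loop levels are collapsed into one parallel iteration space.
  #pragma omp parallel for collapse(2)
  for (uint64_t i1 = 0; i1 < n1; i1++) {
    for (uint64_t i2 = 0; i2 < n2; i2++) {
      plane[i1 * n2 + i2] = 1.0;   // the loop body (__VA_ARGS__) goes here
    }
  }
  std::printf("filled %zu entries\n", plane.size());
}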
@@ -73,12 +73,6 @@ feenableexcept (unsigned int excepts)
 }
 #endif
 
-uint32_t gpu_threads=8;
-#ifdef GRID_SYCL
-cl::sycl::queue *theGridAccelerator;
-#endif
-
-
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////
@@ -196,16 +190,12 @@ void GridParseLayout(char **argv,int argc,
     assert(ompthreads.size()==1);
     GridThread::SetThreads(ompthreads[0]);
   }
-  if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){
+  if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
     std::vector<int> gputhreads(0);
-#ifndef GRID_CUDA
-    std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
-              << " not compiled with GPU support" << std::endl;
-#endif
-    arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads");
+    arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
     GridCmdOptionIntVector(arg,gputhreads);
     assert(gputhreads.size()==1);
-    gpu_threads=gputhreads[0];
+    acceleratorThreads(gputhreads[0]);
   }
 
   if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
@@ -245,8 +235,6 @@ static int Grid_is_initialised;
 /////////////////////////////////////////////////////////
 void GridBanner(void)
 {
-  static int printed =0;
-  if( !printed ) {
   std::cout <<std::endl;
   std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
   std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
@@ -282,125 +270,7 @@ void GridBanner(void)
   std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
 #endif
   std::cout << std::endl;
-  printed=1;
-  }
 }
-#ifdef GRID_CUDA
-cudaDeviceProp *gpu_props;
-void GridGpuInit(void)
-{
-  int nDevices = 1;
-  cudaGetDeviceCount(&nDevices);
-  gpu_props = new cudaDeviceProp[nDevices];
-
-  char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
-  // We extract the local rank initialization using an environment variable
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
-  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
-
-  if ( world_rank == 0 ) {
-    GridBanner();
-  }
-
-  for (int i = 0; i < nDevices; i++) {
-
-#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
-#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
-
-    cudaGetDeviceProperties(&gpu_props[i], i);
-    if ( world_rank == 0) {
-      cudaDeviceProp prop;
-      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
-
-      GPU_PROP(managedMemory);
-      GPU_PROP(isMultiGpuBoard);
-      GPU_PROP(warpSize);
-      // GPU_PROP(unifiedAddressing);
-      // GPU_PROP(l2CacheSize);
-      // GPU_PROP(singleToDoublePrecisionPerfRatio);
-    }
-  }
-#ifdef GRID_IBM_SUMMIT
-  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  if ( world_rank == 0 ) printf("GpuInit: IBM Summit or similar - NOT setting device to node rank\n");
-#else
-  if ( world_rank == 0 ) printf("GpuInit: setting device to node rank\n");
-  cudaSetDevice(rank);
-#endif
-  if ( world_rank == 0 ) printf("GpuInit: ================================================\n");
-}
-#endif
-#ifdef GRID_SYCL
-void GridGpuInit(void)
-{
-  int nDevices = 1;
-  cl::sycl::gpu_selector selector;
-  cl::sycl::device selectedDevice { selector };
-  theGridAccelerator = new sycl::queue (selectedDevice);
-
-  char * localRankStr = NULL;
-  int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
-  // We extract the local rank initialization using an environment variable
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
-  {
-    rank = atoi(localRankStr);
-  }
-  if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
-  if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
-
-  if ( world_rank == 0 ) {
-    GridBanner();
-  }
-  /*
-  for (int i = 0; i < nDevices; i++) {
-
-#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
-#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
-
-    cudaGetDeviceProperties(&gpu_props[i], i);
-    if ( world_rank == 0) {
-      cudaDeviceProp prop;
-      prop = gpu_props[i];
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device Number : %d\n", i);
-      printf("GpuInit: ========================\n");
-      printf("GpuInit: Device identifier: %s\n", prop.name);
-    }
-  }
-  */
-  if ( world_rank == 0 ) {
-    printf("GpuInit: ================================================\n");
-  }
-}
-#endif
-#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))
-void GridGpuInit(void){}
-#endif
 
 void Grid_init(int *argc,char ***argv)
 {
@@ -414,7 +284,7 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   // Early intialisation necessities without rank knowledge
   //////////////////////////////////////////////////////////
-  GridGpuInit(); // Must come first to set device prior to MPI init
+  acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
@@ -483,7 +353,6 @@ void Grid_init(int *argc,char ***argv)
     std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
   }
 
-
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
     MemoryProfiler::debug = true;
    MemoryProfiler::stats = &dbgMemStats;
@@ -82,7 +82,7 @@ int main (int argc, char ** argv)
   pickCheckerboard(Odd,Uo,U);
 
   // std::cout<<GridLogMessage << U<<std::endl;
-  std::cout<<GridLogMessage<< U <<std::endl;
+  // std::cout<<GridLogMessage<< U <<std::endl;
   std::cout<<GridLogMessage << "U " <<norm2(U)<<std::endl;
   std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
   std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;
@@ -69,6 +69,7 @@ int main (int argc, char ** argv)
 
   ShiftU = Cshift(U,dir,shift); // Shift everything
 
+  std::cout<<GridLogMessage<<"Shifted by "<<shift<<" in direction"<<dir<<" checking the AE35 unit" <<std::endl;
   /*
   std::cout << "U[0]" << U[0]<<std::endl;
   std::cout << "U[1]" << U[1]<<std::endl;
@@ -73,7 +73,7 @@ int main(int argc, char **argv) {
   omp_set_num_threads(omp);
 #endif
 
-  for (int lat = 8; lat <= 16; lat += 40) {
+  for (int lat = 16; lat <= 16; lat += 40) {
     std::cout << "Lat " << lat << std::endl;
 
     latt_size[0] = lat;
@@ -159,15 +159,17 @@ int main(int argc, char **argv) {
   LatticeColourMatrix newFoo = Foo;
   // confirm correctness of copy constructor
   Bar = Foo - newFoo;
-  std::cout << "Copy constructor diff check: ";
+  std::cout << "Copy constructor diff check: \n";
   double test_cc = norm2(Bar);
   if (test_cc < 1e-5){
     std::cout << "OK\n";
-  }
-  else{
+  } else{
+    std::cout << "Foo\n"<<Foo<<std::endl;
+    std::cout << "newFoo\n"<<newFoo<<std::endl;
+    std::cout << "Bar\n"<<Bar<<std::endl;
     std::cout << "fail\n";
     abort();
   }
 
   // Norm2 check
   LatticeReal BarReal(&Fine);