mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Improved frontier cshift
This commit is contained in:
parent
6d0c2de399
commit
c9c4576237
@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern Vector<std::pair<int,int> > Cshift_table;
|
extern std::vector<std::pair<int,int> > Cshift_table;
|
||||||
|
extern commVector<std::pair<int,int> > Cshift_table_device;
|
||||||
|
|
||||||
|
inline std::pair<int,int> *MapCshiftTable(void)
|
||||||
|
{
|
||||||
|
// GPU version
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
uint64_t sz=Cshift_table.size();
|
||||||
|
if (Cshift_table_device.size()!=sz ) {
|
||||||
|
Cshift_table_device.resize(sz);
|
||||||
|
}
|
||||||
|
acceleratorCopyToDevice((void *)&Cshift_table[0],
|
||||||
|
(void *)&Cshift_table_device[0],
|
||||||
|
sizeof(Cshift_table[0])*sz);
|
||||||
|
|
||||||
|
return &Cshift_table_device[0];
|
||||||
|
#else
|
||||||
|
return &Cshift_table[0];
|
||||||
|
#endif
|
||||||
|
// CPU version use identify map
|
||||||
|
}
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = &Cshift_table[0];
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
|
|||||||
|
|
||||||
{
|
{
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = &Cshift_table[0];
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
@ -297,30 +316,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
|
|
||||||
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
|
|
||||||
{
|
|
||||||
int idx = blockIdx.x*blockDim.x + threadIdx.x;
|
|
||||||
if (idx >= e1*e2) return;
|
|
||||||
|
|
||||||
int n, b, o;
|
|
||||||
|
|
||||||
n = idx / e2;
|
|
||||||
b = idx % e2;
|
|
||||||
o = n*stride + b;
|
|
||||||
|
|
||||||
vector[2*idx + 0] = lo + o;
|
|
||||||
vector[2*idx + 1] = ro + o;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// local to node block strided copies
|
// local to node block strided copies
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -345,20 +340,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
int ent=0;
|
int ent=0;
|
||||||
|
|
||||||
if(cbmask == 0x3 ){
|
if(cbmask == 0x3 ){
|
||||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
|
|
||||||
ent = e1*e2;
|
|
||||||
dim3 blockSize(acceleratorThreads());
|
|
||||||
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
|
|
||||||
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
|
|
||||||
accelerator_barrier();
|
|
||||||
#else
|
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
@ -372,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
auto table = &Cshift_table[0];
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
@ -409,19 +396,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
int ent=0;
|
int ent=0;
|
||||||
|
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
|
|
||||||
ent = e1*e2;
|
|
||||||
dim3 blockSize(acceleratorThreads());
|
|
||||||
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
|
|
||||||
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
|
|
||||||
accelerator_barrier();
|
|
||||||
#else
|
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride;
|
int o =n*stride;
|
||||||
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||||
}}
|
}}
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
@ -432,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
auto table = &Cshift_table[0];
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
|
@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
|||||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||||
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
|
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
|
|
||||||
|
RealD t1,t0;
|
||||||
|
t0=usecond();
|
||||||
if ( !comm_dim ) {
|
if ( !comm_dim ) {
|
||||||
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
||||||
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
||||||
@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
|||||||
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
||||||
Cshift_comms(ret,rhs,dimension,shift);
|
Cshift_comms(ret,rhs,dimension,shift);
|
||||||
}
|
}
|
||||||
|
t1=usecond();
|
||||||
|
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -127,16 +130,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
if (comm_proc==0) {
|
if (comm_proc==0) {
|
||||||
|
tcopy-=usecond();
|
||||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
tcopy+=usecond();
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
@ -144,26 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
|
tgather-=usecond();
|
||||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
// int rank = grid->_processor;
|
// int rank = grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
grid->Barrier();
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
|
||||||
grid->Barrier();
|
tscatter-=usecond();
|
||||||
|
|
||||||
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
@ -190,6 +210,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
int permute_type=grid->PermuteType(dimension);
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
@ -227,7 +253,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
pointers[i] = &send_buf_extract[i][0];
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
}
|
}
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
tgather-=usecond();
|
||||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
@ -252,7 +280,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
grid->Barrier();
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
@ -262,7 +291,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
grid->Barrier();
|
xbytes+=bytes;
|
||||||
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
} else {
|
} else {
|
||||||
@ -270,9 +301,17 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
tscatter-=usecond();
|
||||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
@ -292,6 +331,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
assert(comm_dim==1);
|
assert(comm_dim==1);
|
||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||||
@ -315,7 +359,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
|
|
||||||
if (comm_proc==0) {
|
if (comm_proc==0) {
|
||||||
|
|
||||||
|
tcopy-=usecond();
|
||||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
tcopy+=usecond();
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
@ -324,7 +370,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
|
tgather-=usecond();
|
||||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
// int rank = grid->_processor;
|
// int rank = grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
@ -332,7 +380,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
|
||||||
grid->Barrier();
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
@ -340,13 +389,24 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||||
|
|
||||||
grid->Barrier();
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
|
||||||
|
tscatter-=usecond();
|
||||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
@ -372,6 +432,11 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
assert(simd_layout==2);
|
assert(simd_layout==2);
|
||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
int permute_type=grid->PermuteType(dimension);
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
@ -414,8 +479,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
pointers[i] = &send_buf_extract[i][0];
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
}
|
}
|
||||||
|
tgather-=usecond();
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
@ -440,7 +507,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
grid->Barrier();
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
@ -449,17 +517,28 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
|
|
||||||
grid->Barrier();
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
} else {
|
} else {
|
||||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
tscatter-=usecond();
|
||||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
}
|
tscatter+=usecond();
|
||||||
|
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
Vector<std::pair<int,int> > Cshift_table;
|
// Table of index pairs filled by the Cshift plane routines
// (e.g. Copy_plane stores std::pair<int,int>(lo+o,ro+o) entries into it).
std::vector<std::pair<int,int> > Cshift_table;
|
||||||
|
// Companion table: when ACCELERATOR_CSHIFT is defined, MapCshiftTable()
// resizes it to match Cshift_table and copies Cshift_table into it.
commVector<std::pair<int,int> > Cshift_table_device;
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user