mirror of https://github.com/paboyle/Grid.git (synced 2025-04-25 21:25:56 +01:00)

commit eeb6e0a6e3 (parent cad5b187dd)
Re-enable cache blocking and efficient UPI-type SHM comms
@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
+
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);
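Note: the new Stencil.lo / StencilEven.lo / StencilOdd.lo pointers hand each stencil the Lebesgue-ordered site list the operator already builds, so the kernels can visit sites in a cache-blocked rather than lexicographic order; the 4D WilsonFermion constructor in the next hunk gets the same wiring. As a rough illustration of the idea only (a sketch, not Grid's LebesgueOrder), a Morton/Z-curve permutation keeps spatially nearby sites adjacent in traversal order, so stencil neighbours tend to remain in cache:

// Minimal sketch of a Morton (Z-curve) style site reordering, the idea behind
// cache-blocked traversal. Illustrative only; Grid's LebesgueOrder differs in detail.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

static inline uint64_t interleave2(uint32_t x, uint32_t y) {
  // Interleave the bits of (x,y) into a single Z-curve key.
  uint64_t z = 0;
  for (int b = 0; b < 32; b++) {
    z |= ((uint64_t)((x >> b) & 1u)) << (2 * b);
    z |= ((uint64_t)((y >> b) & 1u)) << (2 * b + 1);
  }
  return z;
}

// Build a permutation of site indices on an Lx x Ly slab, ordered by Morton key.
std::vector<int> MortonOrder(int Lx, int Ly) {
  std::vector<std::pair<uint64_t,int>> keyed;
  keyed.reserve((size_t)Lx * Ly);
  for (int y = 0; y < Ly; y++)
    for (int x = 0; x < Lx; x++)
      keyed.push_back({interleave2(x, y), y * Lx + x});
  std::sort(keyed.begin(), keyed.end());           // sort sites by Z-curve key
  std::vector<int> order(keyed.size());
  for (size_t i = 0; i < keyed.size(); i++) order[i] = keyed[i].second;
  return order;                                    // order[sss] = reordered site index
}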
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
   _tmp(&Hgrid),
   anisotropyCoeff(anis)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
   // Allocate the required comms buffer
   ImportGauge(_Umu);
   if (anisotropyCoeff.isAnisotropic){
@@ -433,7 +433,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   });

 #define ASM_CALL(A)                                             \
-  thread_for( ss, Nsite, {                                      \
+  thread_for( sss, Nsite, {                                     \
+    int ss = st.lo->Reorder(sss);                               \
     int sU = ss;                                                \
     int sF = ss*Ls;                                             \
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
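This macro change is where the cache blocking reaches the assembler kernel path: the outer thread_for now runs over a dummy index sss and maps it through st.lo->Reorder(sss), while the rest of the macro body is untouched. A minimal sketch of the same pattern with a generic permutation table (hypothetical names, not Grid code):

// Sketch: apply a precomputed site permutation inside a parallel site loop.
// "reorder" plays the role of st.lo->Reorder(); only the visit order changes,
// the per-site work (sU, sF, kernel call) is exactly as before.
#include <vector>

void ApplyStencil(int Nsite, int Ls, const std::vector<int>& reorder,
                  void (*kernel)(int sF, int sU)) {
  #pragma omp parallel for
  for (int sss = 0; sss < Nsite; sss++) {
    int ss = reorder[sss];   // cache-blocked visit order
    int sU = ss;             // gauge-field site index
    int sF = ss * Ls;        // fermion-field site index (5th dimension innermost)
    kernel(sF, sU);
  }
}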
@@ -290,9 +290,9 @@ public:

 protected:
   GridBase * _grid;

 public:
   GridBase *Grid(void) const { return _grid; }
+  LebesgueOrder *lo;

   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
@@ -337,6 +337,7 @@ public:
   ////////////////////////////////////////
   // Stencil query
   ////////////////////////////////////////
+#ifdef SHM_FAST_PATH
   inline int SameNode(int point) {

     int dimension = this->_directions[point];
@@ -356,7 +357,40 @@ public:
     if ( displacement == 0 ) return 1;
     return 0;
   }
+#else
+  //
+  inline int SameNode(int point) {
+
+    int dimension    = this->_directions[point];
+    int displacement = this->_distances[point];
+
+    int pd = _grid->_processors[dimension];
+    int fd = _grid->_fdimensions[dimension];
+    int ld = _grid->_ldimensions[dimension];
+    int rd = _grid->_rdimensions[dimension];
+    int simd_layout = _grid->_simd_layout[dimension];
+    int comm_dim    = _grid->_processors[dimension] >1 ;
+
+    int recv_from_rank;
+    int xmit_to_rank;
+
+    if ( ! comm_dim ) return 1;
+
+    int nbr_proc;
+    if (displacement>0) nbr_proc = 1;
+    else                nbr_proc = pd-1;
+
+    // FIXME this logic needs to be sorted for three link term
+    // assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
+
+    if ( (shm==NULL) ) return 0;
+    return 1;
+  }
+#endif
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   // Use OpenMP Tasks for cleaner ???
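The non-fast-path SameNode variant classifies each stencil leg by asking whether the neighbouring rank's receive buffer can be translated into a local shared-memory alias: ShmBufferTranslate returning NULL means the neighbour sits on another node and the leg genuinely needs the network. A self-contained sketch of the same node-locality test written with plain MPI (illustrative only, not Grid's mechanism):

// Sketch: decide whether a peer rank lives on the same node using an MPI
// shared-memory communicator. This is the generic idea behind a NULL /
// non-NULL ShmBufferTranslate result; it is not Grid's implementation.
#include <mpi.h>
#include <vector>

int PeerIsOnNode(MPI_Comm world, int peer_rank) {
  MPI_Comm node_comm;
  MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);

  int node_size, world_rank;
  MPI_Comm_size(node_comm, &node_size);
  MPI_Comm_rank(world, &world_rank);

  // Gather the world ranks present on this node and look for the peer.
  std::vector<int> node_world_ranks(node_size);
  MPI_Allgather(&world_rank, 1, MPI_INT,
                node_world_ranks.data(), 1, MPI_INT, node_comm);

  int on_node = 0;
  for (int r : node_world_ranks)
    if (r == peer_rank) on_node = 1;

  MPI_Comm_free(&node_comm);
  return on_node;
}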
@@ -1127,11 +1161,32 @@ public:
       recv_buf=this->u_recv_buf_p;
     }

+    // potential SHM fast path for intranode
+    int shm_send=0;
+    int shm_recv=0;
+#ifdef SHM_FAST_PATH
+    // Put directly in place if we can
+    send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+    if ( (send_buf==NULL) ) {
+      shm_send=0;
+      send_buf = this->u_send_buf_p;
+    } else {
+      shm_send=1;
+    }
+    void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+    if ( test_ptr != NULL ) shm_recv = 1;
+    // static int printed;
+    // if (!printed){
+    //   std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+    //   printed = 1;
+    // }
+#else
     ////////////////////////////////////////////////////////
     // Gather locally
     ////////////////////////////////////////////////////////
     send_buf = this->u_send_buf_p; // Gather locally, must send
     assert(send_buf!=NULL);
+#endif
+
     // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
     compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
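With SHM_FAST_PATH enabled, this gather tries to deposit the face directly into the neighbour's receive buffer through its shared-memory alias and falls back to the rank's own send buffer (and later MPI) when the translation fails; shm_send / shm_recv record which side the fast path covered. A minimal sketch of the underlying mechanism using MPI-3 shared windows (illustrative only; buffer sizes and names are made up):

// Sketch: an MPI-3 shared window gives each rank a direct pointer into a
// node-neighbour's receive buffer, the mechanism a ShmBufferTranslate-style
// fast path relies on. Not Grid code; purely an illustration.
#include <mpi.h>
#include <cstring>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  // Ranks sharing a node get their own communicator and one shared window.
  MPI_Comm node_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &node_comm);
  int rank, size;
  MPI_Comm_rank(node_comm, &rank);
  MPI_Comm_size(node_comm, &size);

  const MPI_Aint bytes = 1024;            // each rank's "receive buffer"
  char *my_recv_buf = nullptr;
  MPI_Win win;
  MPI_Win_allocate_shared(bytes, 1, MPI_INFO_NULL, node_comm,
                          &my_recv_buf, &win);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

  // "Translate": obtain a direct pointer into the next rank's buffer, then
  // deposit the halo with a plain memset/memcpy instead of an MPI_Isend.
  int peer = (rank + 1) % size;
  MPI_Aint peer_bytes; int disp_unit; char *peer_recv_buf = nullptr;
  MPI_Win_shared_query(win, peer, &peer_bytes, &disp_unit, &peer_recv_buf);
  std::memset(peer_recv_buf, rank, 64);   // direct intranode "send"

  MPI_Win_sync(win);                      // flush local stores to the window
  MPI_Barrier(node_comm);                 // everyone has deposited
  MPI_Win_sync(win);                      // make peers' deposits visible
  MPI_Win_unlock_all(win);

  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}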
@@ -1143,10 +1198,13 @@ public:
     // Build a list of things to do after we synchronise GPUs
     // Start comms now???
     ///////////////////////////////////////////////////////////
+    int do_send = (comms_send|comms_partial_send) && (!shm_send );
+    int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+
     AddPacket((void *)&send_buf[comm_off],
               (void *)&recv_buf[comm_off],
-              xmit_to_rank, comms_send|comms_partial_send,
-              recv_from_rank, comms_recv|comms_partial_recv,
+              xmit_to_rank, do_send,
+              recv_from_rank, do_recv,
               xbytes,rbytes);
   }

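Design note: with the do_send / do_recv flags, a stencil leg only queues MPI traffic when a transfer is wanted and the shared-memory fast path has not already covered it (data deposited straight into the neighbour's receive buffer, or read in place from the neighbour's send buffer). With SHM_FAST_PATH disabled, shm_send and shm_recv stay zero and every requested transfer is still queued over MPI as before.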
@@ -1288,19 +1346,47 @@ public:

       int recv_from_rank;
       int xmit_to_rank;
+      int shm_send=0;
+      int shm_recv=0;
       _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+#ifdef SHM_FAST_PATH
+#warning STENCIL SHM FAST PATH SELECTED
+      // shm == receive pointer         if offnode
+      // shm == Translate[send pointer] if on node -- my view of his send pointer
+      cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+      if (shm==NULL) {
+        shm = rp;
+        // we found a packet that comes from MPI and contributes to this shift.
+        // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+        // Kernel will add the exterior_terms except if is_same_node.
+        // leg of stencil
+        shm_recv=0;
+      } else {
+        shm_recv=1;
+      }
+      rpointers[i] = shm;
+      // Test send side
+      void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+      if ( test_ptr != NULL ) shm_send = 1;
+      // static int printed;
+      // if (!printed){
+      //   std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+      //   printed = 1;
+      // }
+#else
      rpointers[i] = rp;
+#endif

      int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
      if ( !duplicate ) {
        if ( (bytes != rbytes) && (rbytes!=0) ){
          acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
        }
+        int do_send = (comms_send|comms_partial_send) && (!shm_send );
+        int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
        AddPacket((void *)sp,(void *)rp,
-                 xmit_to_rank,comms_send|comms_partial_send,
-                 recv_from_rank,comms_recv|comms_partial_recv,
+                 xmit_to_rank,do_send,
+                 recv_from_rank,do_send,
                  xbytes,rbytes);
      }

@@ -1310,7 +1396,7 @@ public:

        }
      }
+     // rpointer may be doing a remote read in the gather over SHM
      if ( comms_recv|comms_partial_recv ) {
        AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
      }
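Note that in the SIMD gather the fast path redirects the merge source rather than the packet destination: rpointers[i] ends up pointing either at the translated view of an on-node neighbour's send buffer or at the local MPI receive buffer, and the later AddMerge assembles the halo from whichever mix results; that is what the new comment about the rpointer doing a remote read over SHM refers to.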
@@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in
      ;;
    *) ;;
 esac
+############### force MPI in SMP
+AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no])
+case ${ac_SHM_FAST_PATH} in
+   yes)
+     AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] )
+     ;;
+   *) ;;
+esac
+
 ############### communication type selection
 AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
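Usage note: the new switch defaults to off, so behaviour is unchanged unless the library is configured with --enable-shm-fast-path, in which case SHM_FAST_PATH is defined and the #ifdef SHM_FAST_PATH branches above are compiled in, letting kernels copy halo data directly between ranks on the same node instead of staging it through MPI.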