mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Almost working on Aurora
This commit is contained in:
parent
e637fbacae
commit
02c8178f16
@ -325,12 +325,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
// Start comms // Gather intranode and extra node differentiated??
|
// Start comms // Gather intranode and extra node differentiated??
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
std::cout << " WilsonFermion5D gather " <<std::endl;
|
// std::cout << " WilsonFermion5D gather " <<std::endl;
|
||||||
GRID_TRACE("Gather");
|
GRID_TRACE("Gather");
|
||||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
|
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
auto id=traceStart("Communicate overlapped");
|
auto id=traceStart("Communicate overlapped");
|
||||||
st.CommunicateBegin(requests);
|
st.CommunicateBegin(requests);
|
||||||
@ -339,7 +339,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
// Overlap with comms
|
// Overlap with comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
std::cout << " WilsonFermion5D Comms merge " <<std::endl;
|
// std::cout << " WilsonFermion5D Comms merge " <<std::endl;
|
||||||
GRID_TRACE("MergeSHM");
|
GRID_TRACE("MergeSHM");
|
||||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
}
|
}
|
||||||
@ -347,7 +347,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute interior
|
// do the compute interior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
std::cout << " WilsonFermion5D Interior " <<std::endl;
|
// std::cout << " WilsonFermion5D Interior " <<std::endl;
|
||||||
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDagInterior");
|
GRID_TRACE("DhopDagInterior");
|
||||||
@ -360,7 +360,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Complete comms
|
// Complete comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
|
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
traceStop(id);
|
traceStop(id);
|
||||||
|
|
||||||
@ -368,13 +368,13 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
// do the compute exterior
|
// do the compute exterior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
|
// std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
|
||||||
GRID_TRACE("Merge");
|
GRID_TRACE("Merge");
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::cout << " WilsonFermion5D Exterior " <<std::endl;
|
// std::cout << " WilsonFermion5D Exterior " <<std::endl;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDagExterior");
|
GRID_TRACE("DhopDagExterior");
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
@ -382,7 +382,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
GRID_TRACE("DhopExterior");
|
GRID_TRACE("DhopExterior");
|
||||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
}
|
}
|
||||||
std::cout << " WilsonFermion5D Done " <<std::endl;
|
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -397,13 +397,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
|
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
|
|
||||||
std::cout << " WilsonFermion5D Halo exch " <<std::endl;
|
// std::cout << " WilsonFermion5D Halo exch " <<std::endl;
|
||||||
{
|
{
|
||||||
GRID_TRACE("HaloExchange");
|
GRID_TRACE("HaloExchange");
|
||||||
st.HaloExchangeOpt(in,compressor);
|
st.HaloExchangeOpt(in,compressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << " WilsonFermion5D Dhop " <<std::endl;
|
// std::cout << " WilsonFermion5D Dhop " <<std::endl;
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDag");
|
GRID_TRACE("DhopDag");
|
||||||
@ -412,7 +412,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
GRID_TRACE("Dhop");
|
GRID_TRACE("Dhop");
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||||
}
|
}
|
||||||
std::cout << " WilsonFermion5D Done " <<std::endl;
|
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -365,8 +365,8 @@ public:
|
|||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
// All GPU kernel tasks must complete
|
// All GPU kernel tasks must complete
|
||||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
accelerator_barrier(); // All kernels should ALREADY be complete
|
||||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
_grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
// But the HaloGather had a barrier too.
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
@ -390,8 +390,8 @@ public:
|
|||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
else DslashLogFull();
|
else DslashLogFull();
|
||||||
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
|
acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||||
// accelerator_barrier();
|
accelerator_barrier();
|
||||||
_grid->StencilBarrier();
|
_grid->StencilBarrier();
|
||||||
// run any checksums
|
// run any checksums
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
@ -473,7 +473,7 @@ public:
|
|||||||
template<class compressor>
|
template<class compressor>
|
||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
// accelerator_barrier();
|
accelerator_barrier();
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
|
||||||
assert(source.Grid()==_grid);
|
assert(source.Grid()==_grid);
|
||||||
@ -487,6 +487,7 @@ public:
|
|||||||
HaloGatherDir(source,compress,point,face_idx);
|
HaloGatherDir(source,compress,point,face_idx);
|
||||||
}
|
}
|
||||||
accelerator_barrier(); // All my local gathers are complete
|
accelerator_barrier(); // All my local gathers are complete
|
||||||
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
face_table_computed=1;
|
face_table_computed=1;
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
}
|
}
|
||||||
@ -653,7 +654,9 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||||
surface_list.resize(surface_list_size);
|
surface_list.resize(surface_list_size);
|
||||||
|
std::vector<int> surface_list_host(surface_list_size);
|
||||||
int32_t ss=0;
|
int32_t ss=0;
|
||||||
for(int site = 0 ;site< vol4;site++){
|
for(int site = 0 ;site< vol4;site++){
|
||||||
int local = 1;
|
int local = 1;
|
||||||
@ -665,12 +668,12 @@ public:
|
|||||||
if(local == 0) {
|
if(local == 0) {
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int idx=site*Ls+s;
|
int idx=site*Ls+s;
|
||||||
acceleratorPut(surface_list[ss],idx);
|
surface_list_host[ss]= idx;
|
||||||
ss++;
|
ss++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||||
}
|
}
|
||||||
/// Introduce a block structure and switch off comms on boundaries
|
/// Introduce a block structure and switch off comms on boundaries
|
||||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||||
|
@ -549,8 +549,31 @@ void GridLogLayout() {
|
|||||||
|
|
||||||
void * Grid_backtrace_buffer[_NBACKTRACE];
|
void * Grid_backtrace_buffer[_NBACKTRACE];
|
||||||
|
|
||||||
|
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
|
{
|
||||||
|
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||||
|
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||||
|
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
|
fprintf(stderr," code %d\n",si->si_code);
|
||||||
|
// x86 64bit
|
||||||
|
#ifdef __linux__
|
||||||
|
#ifdef __x86_64__
|
||||||
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
|
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
fprintf(stderr,"Called backtrace\n");
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stderr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
{
|
{
|
||||||
|
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
fprintf(stderr," code %d\n",si->si_code);
|
fprintf(stderr," code %d\n",si->si_code);
|
||||||
@ -561,7 +584,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
ucontext_t * uc= (ucontext_t *)ptr;
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||||
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
|
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
|
||||||
REG(rdi);
|
REG(rdi);
|
||||||
REG(rsi);
|
REG(rsi);
|
||||||
REG(rbp);
|
REG(rbp);
|
||||||
@ -594,8 +617,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
|
|
||||||
void Grid_exit_handler(void)
|
void Grid_exit_handler(void)
|
||||||
{
|
{
|
||||||
BACKTRACEFP(stdout);
|
// BACKTRACEFP(stdout);
|
||||||
fflush(stdout);
|
// fflush(stdout);
|
||||||
}
|
}
|
||||||
void Grid_debug_handler_init(void)
|
void Grid_debug_handler_init(void)
|
||||||
{
|
{
|
||||||
@ -603,10 +626,10 @@ void Grid_debug_handler_init(void)
|
|||||||
sigemptyset (&sa.sa_mask);
|
sigemptyset (&sa.sa_mask);
|
||||||
sa.sa_sigaction= Grid_sa_signal_handler;
|
sa.sa_sigaction= Grid_sa_signal_handler;
|
||||||
sa.sa_flags = SA_SIGINFO;
|
sa.sa_flags = SA_SIGINFO;
|
||||||
sigaction(SIGSEGV,&sa,NULL);
|
// sigaction(SIGSEGV,&sa,NULL);
|
||||||
sigaction(SIGTRAP,&sa,NULL);
|
sigaction(SIGTRAP,&sa,NULL);
|
||||||
sigaction(SIGBUS,&sa,NULL);
|
sigaction(SIGBUS,&sa,NULL);
|
||||||
sigaction(SIGUSR2,&sa,NULL);
|
// sigaction(SIGUSR2,&sa,NULL);
|
||||||
|
|
||||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||||
|
|
||||||
@ -614,7 +637,14 @@ void Grid_debug_handler_init(void)
|
|||||||
sigaction(SIGKILL,&sa,NULL);
|
sigaction(SIGKILL,&sa,NULL);
|
||||||
sigaction(SIGILL,&sa,NULL);
|
sigaction(SIGILL,&sa,NULL);
|
||||||
|
|
||||||
atexit(Grid_exit_handler);
|
// Non terminating SIGUSR1/2 handler
|
||||||
|
struct sigaction sa_ping;
|
||||||
|
sigemptyset (&sa_ping.sa_mask);
|
||||||
|
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
||||||
|
sa_ping.sa_flags = SA_SIGINFO;
|
||||||
|
sigaction(SIGHUP,&sa_ping,NULL);
|
||||||
|
|
||||||
|
// atexit(Grid_exit_handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -5,63 +5,34 @@
|
|||||||
#PBS -l walltime=00:20:00
|
#PBS -l walltime=00:20:00
|
||||||
#PBS -A LatticeQCD_aesp_CNDA
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
|
|
||||||
#export OMP_PROC_BIND=spread
|
|
||||||
#unset OMP_PLACES
|
|
||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
source ../sourceme.sh
|
source ../sourceme.sh
|
||||||
module load pti-gpu
|
|
||||||
|
|
||||||
#cat $PBS_NODEFILE
|
cp $PBS_NODEFILE nodefile
|
||||||
|
|
||||||
export OMP_NUM_THREADS=4
|
export OMP_NUM_THREADS=4
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
|
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
|
||||||
|
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||||
export MPICH_OFI_NIC_POLICY=GPU
|
export MPICH_OFI_NIC_POLICY=GPU
|
||||||
|
|
||||||
# 12 ppn, 2 nodes, 24 ranks
|
|
||||||
#
|
|
||||||
CMD="mpiexec -np 1 -ppn 1 -envall \
|
|
||||||
./gpu_tile_compact.sh \
|
|
||||||
./Benchmark_usqcd --mpi 1.1.1.1 --grid 24.32.32.24 \
|
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
|
||||||
$CMD | tee usqcd.log
|
|
||||||
|
|
||||||
|
|
||||||
CMD="mpiexec -np 1 -ppn 1 -envall \
|
|
||||||
./gpu_tile_compact.sh \
|
|
||||||
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
|
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
|
|
||||||
$CMD | tee 1tile.dwf
|
|
||||||
|
|
||||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||||
./gpu_tile_compact.sh \
|
./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals"
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
|
||||||
#$CMD | tee 1node.32.32.32.48.dwf
|
|
||||||
|
|
||||||
|
#for f in 1 2 3 4 5 6 7 8
|
||||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
for f in 1
|
||||||
./gpu_tile_compact.sh \
|
do
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
|
echo $CMD
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
$CMD | tee 1node.32.32.64.48.dwf.hbm.$f
|
||||||
#$CMD | tee 1node.64.64.32.96.dwf
|
done
|
||||||
|
|
||||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
|
||||||
./gpu_tile_compact.sh \
|
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
|
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
|
||||||
#$CMD | tee 1node.64.32.32.48.dwf
|
|
||||||
|
|
||||||
|
@ -11,17 +11,16 @@
|
|||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
source ../sourceme.sh
|
source ../sourceme.sh
|
||||||
module load pti-gpu
|
#module load pti-gpu
|
||||||
|
|
||||||
#cat $PBS_NODEFILE
|
|
||||||
|
cp $PBS_NODEFILE nodefile
|
||||||
|
|
||||||
export OMP_NUM_THREADS=4
|
export OMP_NUM_THREADS=4
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
|
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||||
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||||
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||||
@ -34,22 +33,26 @@ export MPICH_OFI_NIC_POLICY=GPU
|
|||||||
# 12 ppn, 2 nodes, 24 ranks
|
# 12 ppn, 2 nodes, 24 ranks
|
||||||
#
|
#
|
||||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||||
./gpu_tile_compact.sh \
|
./gpu_tile.sh \
|
||||||
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
|
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
$CMD | tee 2node.comms
|
#$CMD | tee 2node.comms.hbm
|
||||||
|
|
||||||
|
|
||||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||||
./gpu_tile_compact.sh \
|
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
|
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
|
||||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
|
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
|
||||||
$CMD | tee 2node.32.32.64.48.dwf
|
|
||||||
|
|
||||||
|
#for f in 1 2 3 4 5 6 7 8
|
||||||
|
for f in 1
|
||||||
|
do
|
||||||
|
echo $CMD
|
||||||
|
$CMD | tee 2node.32.32.64.48.dwf.hbm.$f
|
||||||
|
done
|
||||||
|
|
||||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||||
./gpu_tile_compact.sh \
|
./gpu_tile.sh \
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
|
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
|
||||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||||
$CMD | tee 2node.64.64.64.96.dwf
|
#$CMD | tee 2node.64.64.64.96.dwf.hbm
|
||||||
|
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
module load oneapi/release/2023.12.15.001
|
module load oneapi/release/2023.12.15.001
|
||||||
|
#module load mpich/icc-all-debug-pmix-gpu/52.2
|
||||||
|
#module load mpich-config/mode/deterministic
|
||||||
#module load intel_compute_runtime/release/821.35
|
#module load intel_compute_runtime/release/821.35
|
||||||
source ~/spack/share/spack/setup-env.sh
|
source ~/spack/share/spack/setup-env.sh
|
||||||
spack load c-lime
|
spack load c-lime
|
||||||
|
@ -15,13 +15,13 @@
|
|||||||
|
|
||||||
# 56 cores / 6 threads ~9
|
# 56 cores / 6 threads ~9
|
||||||
export OMP_NUM_THREADS=6
|
export OMP_NUM_THREADS=6
|
||||||
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||||
|
|
||||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
@ -30,20 +30,22 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
|||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
|
||||||
export GRID_PRINT_ENTIRE_LOG=0
|
export GRID_PRINT_ENTIRE_LOG=0
|
||||||
export GRID_CHECKSUM_RECV_BUF=1
|
export GRID_CHECKSUM_RECV_BUF=0
|
||||||
export GRID_CHECKSUM_SEND_BUF=1
|
export GRID_CHECKSUM_SEND_BUF=0
|
||||||
|
|
||||||
export MPICH_OFI_NIC_POLICY=GPU
|
export MPICH_OFI_NIC_POLICY=GPU
|
||||||
|
|
||||||
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
|
#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
|
||||||
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
|
#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
|
||||||
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
|
#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
|
||||||
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
|
#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
|
||||||
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
|
#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
|
||||||
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
|
#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
|
||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
|
cp $PBS_NODEFILE nodefile
|
||||||
|
|
||||||
DIR=reproBigJob.$PBS_JOBID
|
DIR=reproBigJob.$PBS_JOBID
|
||||||
|
|
||||||
mkdir -p $DIR
|
mkdir -p $DIR
|
||||||
@ -51,10 +53,19 @@ cd $DIR
|
|||||||
|
|
||||||
cp $PBS_NODEFILE nodefile
|
cp $PBS_NODEFILE nodefile
|
||||||
|
|
||||||
|
BINARY=../Test_dwf_mixedcg_prec
|
||||||
|
|
||||||
|
echo > pingjob <<EOF
|
||||||
|
while read node ;
|
||||||
|
do
|
||||||
|
echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
|
||||||
|
done < nodefile
|
||||||
|
EOF
|
||||||
|
|
||||||
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
|
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
|
||||||
../gpu_tile_compact.sh \
|
../gpu_tile_compact.sh \
|
||||||
../Test_dwf_mixedcg_prec --mpi 4.4.4.6 --grid 128.128.128.96 \
|
$BINARY --mpi 4.4.4.6 --grid 128.128.128.96 \
|
||||||
--shm-mpi 1 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
|
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
|
||||||
|
|
||||||
echo $CMD > command-line
|
echo $CMD > command-line
|
||||||
env > environment
|
env > environment
|
||||||
|
Loading…
x
Reference in New Issue
Block a user