Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)

Compare commits: da81a73b4a ... 7f9d06f339 (15 commits)
Commits in this comparison:
7f9d06f339
ccf147d6c1
7aa12b446f
c293228102
5c4c9f721a
057f86c1de
cd52e3cbc2
24602e1259
8a098889fc
ff2ea5de18
461cd045c6
fee65d7a75
31f9971dbf
d87296f3e8
be94cf1c6f
@@ -1,5 +1,5 @@
 #pragma once
-#include <type_traits>
 #if defined(GRID_CUDA)
 
 #include <cub/cub.cuh>
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
 
 
 }
+#endif
 
-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+#if defined(GRID_SYCL)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+{
+  size_t subvol_size = e1*e2;
+
+  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
+  vobj vobj_zero;
+  zeroit(vobj_zero);
+  for (int r = 0; r<rd; r++) {
+    mysum[r] = vobj_zero;
+  }
+
+  commVector<vobj> reduction_buffer(rd*subvol_size);
+
+  auto rb_p = &reduction_buffer[0];
+
+  // autoView(Data_v, Data, AcceleratorRead);
+
+  //prepare reduction buffer
+  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
+
+    int n = s / e2;
+    int b = s % e2;
+    int so=r*ostride; // base offset for start of plane
+    int ss= so+n*stride+b;
+
+    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
+
+  });
+
+  for (int r = 0; r < rd; r++) {
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+                       Reduction,
+                       [=](cl::sycl::id<1> item, auto &sum) {
+                         auto s = item[0];
+                         sum += rb_p[r*subvol_size+s];
+                       });
+    });
+
+  }
+  theGridAccelerator->wait();
+  for (int r = 0; r < rd; r++) {
+    lvSum[r] = mysum[r];
+  }
+  free(mysum,*theGridAccelerator);
+}
+#endif
+
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
   typedef typename vobj::vector_type vector;
   const int words = sizeof(vobj)/sizeof(vector);
   const int osites = rd*e1*e2;
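For readers less familiar with SYCL 2020 reductions, the sketch below isolates the pattern used by sliceSumReduction_sycl_small: results live in shared USM and are zeroed up front, one sycl::reduction kernel is submitted per plane, and a single wait() follows all submissions. It is a minimal standalone illustration, not Grid code; the names q, nplanes, plane_size, data and sums are invented stand-ins for theGridAccelerator, rd, e1*e2 and the reduction buffer.

#include <sycl/sycl.hpp>
#include <cstddef>
#include <cstdio>
#include <functional>

int main() {
  sycl::queue q;                                   // stands in for *theGridAccelerator
  const size_t nplanes = 4, plane_size = 1024;     // stand-ins for rd and e1*e2

  double *data = sycl::malloc_shared<double>(nplanes * plane_size, q);
  double *sums = sycl::malloc_shared<double>(nplanes, q);
  for (size_t i = 0; i < nplanes * plane_size; i++) data[i] = 1.0;
  for (size_t r = 0; r < nplanes; r++) sums[r] = 0.0;   // zero the accumulators up front

  // One reduction kernel per plane, mirroring the loop over r above.
  for (size_t r = 0; r < nplanes; r++) {
    q.submit([&](sycl::handler &cgh) {
      auto red = sycl::reduction(&sums[r], std::plus<>());
      cgh.parallel_for(sycl::range<1>{plane_size}, red,
                       [=](sycl::id<1> idx, auto &sum) {
                         sum += data[r * plane_size + idx[0]];
                       });
    });
  }
  q.wait();                                        // single wait after all submissions

  for (size_t r = 0; r < nplanes; r++)
    std::printf("plane %zu sum = %f\n", r, sums[r]);

  sycl::free(data, q);
  sycl::free(sums, q);
  return 0;
}

Submitting every plane before the single wait(), as the new Grid routine does, leaves the runtime free to overlap the per-plane kernels; the older sliceSumReduction_sycl removed further down waited inside the loop.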
@@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
     buf[ss] = dat[ss*words+w];
   });
 
-  sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+  sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+#elif defined(GRID_SYCL)
+  sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+#endif
 
   for (int r = 0; r < rd; r++) {
     lvSum_ptr[w+words*r]=lvSum_small[r];
   }
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 
 }
 
-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
+  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
   if constexpr (sizeof(vobj) <= 256) {
-    sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#elif defined (GRID_SYCL)
+    sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#endif
+
   }
   else {
-    sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+    sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
   }
 }
-#endif
 
 
-#if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  typedef typename vobj::scalar_object sobj;
-  size_t subvol_size = e1*e2;
-
-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
-  vobj vobj_zero;
-  zeroit(vobj_zero);
-
-  commVector<vobj> reduction_buffer(rd*subvol_size);
-
-  auto rb_p = &reduction_buffer[0];
-
-  autoView(Data_v, Data, AcceleratorRead);
-
-  //prepare reduction buffer
-  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
-
-    int n = s / e2;
-    int b = s % e2;
-    int so=r*ostride; // base offset for start of plane
-    int ss= so+n*stride+b;
-
-    coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
-
-  });
-
-  for (int r = 0; r < rd; r++) {
-    mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-      auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-      cgh.parallel_for(cl::sycl::range<1>{subvol_size},
-                       Reduction,
-                       [=](cl::sycl::id<1> item, auto &sum) {
-                         auto s = item[0];
-                         sum += rb_p[r*subvol_size+s];
-                       });
-    });
-    theGridAccelerator->wait();
-    lvSum[r] = mysum[0];
-  }
-
-  free(mysum,*theGridAccelerator);
-}
-#endif
 
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
   // sum over reduced dimension planes, breaking out orthog dir
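The small/large split above hinges on if constexpr: the 256-byte threshold is evaluated at compile time, so only one branch is instantiated for a given vobj. A minimal illustration of that dispatch idea, with invented names (reduce, reduce_small, reduce_large, Small, Large) and no Grid types:

#include <cstddef>
#include <cstdio>

template <class T> void reduce_small(const T *, std::size_t n) { std::printf("small path, %zu elements\n", n); }
template <class T> void reduce_large(const T *, std::size_t n) { std::printf("large path, %zu elements\n", n); }

// Pick the implementation at compile time from the element size,
// analogous to the 256-byte test in sliceSumReduction_gpu.
template <class T>
void reduce(const T *data, std::size_t n) {
  if constexpr (sizeof(T) <= 256) reduce_small(data, n);
  else                            reduce_large(data, n);
}

struct Small { char c[64];  };
struct Large { char c[512]; };

int main() {
  Small s[4]; Large l[4];
  reduce(s, 4);   // prints: small path
  reduce(l, 4);   // prints: large path
  return 0;
}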
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
 
 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
-#if defined(GRID_CUDA) || defined(GRID_HIP)
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
 
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 
-#elif defined(GRID_SYCL)
-
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-
 #else
   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
(next file — filename not preserved in this view: end of a Grid header carrying the SYCL is_device_copyable workaround)

@@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid);
 NAMESPACE_END(Grid);
 
 
-#ifdef GRID_SYCL
-template<typename T> struct
-sycl::is_device_copyable<T, typename std::enable_if<
-  Grid::isGridTensor<T>::value && (!std::is_trivially_copyable<T>::value),
-  void>::type>
-  : public std::true_type {};
-#endif
 
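The block removed above was the opt-in that told SYCL that Grid tensor types, although not trivially copyable, may still be captured by value in device kernels. As a standalone illustration of that mechanism (assumed SYCL 2020 behaviour; MyTensor is an invented stand-in, not a Grid type):

#include <sycl/sycl.hpp>
#include <type_traits>

// A type with a user-provided copy constructor: not trivially copyable,
// so by default SYCL will not treat it as device copyable.
struct MyTensor {
  double v[4];
  MyTensor() = default;
  MyTensor(const MyTensor &o) { for (int i = 0; i < 4; i++) v[i] = o.v[i]; }
};

// Explicit opt-in: the program promises the type is safe to copy to the
// device byte-for-byte. This mirrors the specialization removed in the hunk above.
template <>
struct sycl::is_device_copyable<MyTensor> : std::true_type {};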
(next file: acceleratorInit(), SYCL queue setup)

@@ -210,8 +210,8 @@ void acceleratorInit(void)
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
-  // theCopyAccelerator = new sycl::queue (selectedDevice);
-  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
+  theCopyAccelerator = new sycl::queue (selectedDevice);
+  // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
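The change above gives the copy queue its own sycl::queue on the selected device instead of aliasing it to the compute queue. A minimal standalone sketch of that two-queue setup, using the modern selector spelling and invented names (compute, copy, a, b); it is not Grid's initialisation code:

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::device dev{sycl::gpu_selector_v};   // modern spelling of the GPU selector
  sycl::queue compute{dev};                 // plays the role of theGridAccelerator
  sycl::queue copy{dev};                    // plays the role of theCopyAccelerator

  const size_t N = 1 << 20;
  float *a = sycl::malloc_device<float>(N, compute);  // worked on by the compute queue
  float *b = sycl::malloc_device<float>(N, copy);     // staged by the copy queue
  std::vector<float> host(N, 2.0f);

  // Independent work on the two queues can be in flight at the same time.
  auto k = compute.parallel_for(sycl::range<1>{N}, [=](sycl::id<1> i) { a[i] = 1.0f; });
  auto c = copy.memcpy(b, host.data(), N * sizeof(float));
  k.wait();
  c.wait();

  sycl::free(a, compute);
  sycl::free(b, copy);
  return 0;
}

With two queues, a transfer and a kernel touching independent buffers can be enqueued and progressed independently, which appears to be the motivation for keeping theCopyAccelerator separate.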
(next file: FlightRecorder communication logging)

@@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global)
 }
 void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 {
+  if(LoggingMode == LoggingModeNone) return;
+
   if ( ChecksumCommsSend ){
     uint64_t *ubuf = (uint64_t *)buf;
     if(LoggingMode == LoggingModeNone) return;
 
 #ifdef GRID_SYCL
     uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
     if(LoggingMode == LoggingModePrint) {
@@ -285,12 +288,6 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
       XmitLoggingCounter++;
     }
 #endif
-  } else {
-    uint64_t word = 1;
-    deviceVector<uint64_t> dev(1);
-    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
-    acceleratorCopySynchronise();
-    MPI_Barrier(MPI_COMM_WORLD);
   }
 }
 void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
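For context on the checksum being logged here: svm_xor folds the outgoing buffer into a single 64-bit XOR word, which the flight recorder can compare between send and receive sides. A host-side stand-in for that fold (illustrative only; xor_checksum is an invented name, not Grid's device implementation):

#include <cstdint>
#include <cstddef>

// XOR-fold a buffer of 64-bit words into one checksum word.
// The fold is order-independent, so it is also cheap to compute in parallel on a device.
inline uint64_t xor_checksum(const uint64_t *words, std::size_t nwords) {
  uint64_t x = 0;
  for (std::size_t i = 0; i < nwords; i++) x ^= words[i];
  return x;
}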
systems/Aurora/benchmarks/bench1.pbs (new file, 67 lines)
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+#PBS -q debug
+#PBS -l select=1
+#PBS -l walltime=00:20:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+module load pti-gpu
+
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 2 nodes, 24 ranks
+#
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+#$CMD | tee 1node.comms
+
+
+CMD="mpiexec -np 1 -ppn 1 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
+#$CMD | tee 1tile.dwf
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1node.32.32.32.48.dwf
+
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.64.32.96.dwf
+
+CMD="mpiexec -np 12 -ppn 12 -envall \
+     ./gpu_tile_compact.sh \
+     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.32.32.48.dwf
+
(next file: an existing Aurora benchmark job script)

@@ -1,10 +1,8 @@
 #!/bin/bash
 
-## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
-
-#PBS -q EarlyAppAccess
+#PBS -q workq
 #PBS -l select=2
-#PBS -l walltime=01:00:00
+#PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA
 
 #export OMP_PROC_BIND=spread
@@ -13,11 +11,13 @@
 cd $PBS_O_WORKDIR
 
 source ../sourceme.sh
+module load pti-gpu
 
-export OMP_NUM_THREADS=3
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
-
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 
+# 12 ppn, 2 nodes, 24 ranks
+#
 CMD="mpiexec -np 24 -ppn 12 -envall \
      ./gpu_tile_compact.sh \
-     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
-     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+$CMD | tee 2node.comms
 
-#$CMD
 
 CMD="mpiexec -np 24 -ppn 12 -envall \
      ./gpu_tile_compact.sh \
-     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
-     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.32.32.64.48.dwf
 
-#$CMD
 
-CMD="mpiexec -np 1 -ppn 1 -envall \
+CMD="mpiexec -np 24 -ppn 12 -envall \
      ./gpu_tile_compact.sh \
-     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
-     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
+     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.64.64.64.96.dwf
 
-$CMD
-
-CMD="mpiexec -np 1 -ppn 1 -envall \
-     ./gpu_tile_compact.sh \
-     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
-     --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-
-$CMD
(next file: the per-rank GPU tile binding wrapper script)

@@ -1,33 +1,34 @@
 #!/bin/bash
 
-export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
-#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
-export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
-export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
+#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
+export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
+export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
 
 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
-export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
-export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
 
-#export GRID_MPICH_NIC_BIND=$NIC
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
 
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
-export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ZE_AFFINITY_MASK=$gpu_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero
 
-#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 
-numactl -m $NUMA -N $NUMAP "$@"
+if [ $PALS_RANKID = "0" ]
+then
+# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@"
+# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+  numactl -m $NUMA -N $NUMA "$@"
+else
+  numactl -m $NUMA -N $NUMA "$@"
+fi
(next file: Aurora configure/build options)

@@ -7,7 +7,7 @@
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-accelerator-aware-mpi=no\
+	--enable-accelerator-aware-mpi=yes\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
(next file: Aurora environment/module setup script)

@@ -1,7 +1,9 @@
 #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
 
-module use /soft/modulefiles
-module load intel_compute_runtime/release/agama-devel-682.22
+module load oneapi/release/2023.12.15.001
+#module use /soft/modulefiles
+#module load intel_compute_runtime/release/agama-devel-682.22
 
 export FI_CXI_DEFAULT_CQ_SIZE=131072
 export FI_CXI_CQ_FILL_PERCENT=20