mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-24 02:32:02 +01:00
Compare commits
7 Commits
4ea48ef0c4
...
release/0.
Author | SHA1 | Date | |
---|---|---|---|
a00ae981e0 | |||
3f2fd49db4 | |||
0efa107cb6 | |||
8feedb4f6f | |||
05e562e3d7 | |||
dd3bbb8fa2 | |||
2fbcf13c46 |
@ -401,8 +401,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||||
{
|
{
|
||||||
// std::cout << "Copy Synchronised\n"<<std::endl;
|
// std::cout << "Copy Synchronised\n"<<std::endl;
|
||||||
acceleratorCopySynchronise();
|
|
||||||
|
|
||||||
int nreq=list.size();
|
int nreq=list.size();
|
||||||
|
|
||||||
if (nreq==0) return;
|
if (nreq==0) return;
|
||||||
|
@ -36,10 +36,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
|||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
#include <hip/hip_runtime_api.h>
|
#include <hip/hip_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCl
|
#ifdef GRID_SYCL
|
||||||
|
#define GRID_SYCL_LEVEL_ZERO_IPC
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
#define header "SharedMemoryMpi: "
|
#define header "SharedMemoryMpi: "
|
||||||
/*Construct from an MPI communicator*/
|
/*Construct from an MPI communicator*/
|
||||||
|
@ -459,11 +459,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||||
#ifdef SYCL_HACK
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
|
|
||||||
#else
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||||
#endif
|
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||||
#endif
|
#endif
|
||||||
@ -474,6 +470,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
|
acceleratorFenceComputeStream();
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_CUDA
|
||||||
@ -498,10 +495,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
#ifndef GRID_CUDA
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||||
#endif
|
#endif
|
||||||
acceleratorFenceComputeStream();
|
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||||
#endif
|
#endif
|
||||||
|
@ -398,6 +398,8 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
// Buffers are gathered AND synchronised
|
||||||
|
// Copies are MPI ISend OR asynch copy on copy stream
|
||||||
reqs.resize(Packets.size());
|
reqs.resize(Packets.size());
|
||||||
commtime-=usecond();
|
commtime-=usecond();
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
@ -410,14 +412,18 @@ public:
|
|||||||
comms_bytes+=bytes;
|
comms_bytes+=bytes;
|
||||||
shm_bytes +=2*Packets[i].bytes-bytes;
|
shm_bytes +=2*Packets[i].bytes-bytes;
|
||||||
}
|
}
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
// complete intranode
|
||||||
|
acceleratorCopySynchronise();
|
||||||
|
// complete MPI
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
_grid->StencilSendToRecvFromComplete(reqs[i],i);
|
_grid->StencilSendToRecvFromComplete(reqs[i],i);
|
||||||
}
|
}
|
||||||
|
// Everyone agrees we are all done
|
||||||
|
_grid->StencilBarrier();
|
||||||
commtime+=usecond();
|
commtime+=usecond();
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
@ -425,33 +431,9 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void Communicate(void)
|
void Communicate(void)
|
||||||
{
|
{
|
||||||
if ( 0 ){
|
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||||
thread_region {
|
this->CommunicateBegin(reqs);
|
||||||
// must be called in parallel region
|
this->CommunicateComplete(reqs);
|
||||||
int mythread = thread_num();
|
|
||||||
int maxthreads= thread_max();
|
|
||||||
int nthreads = CartesianCommunicator::nCommThreads;
|
|
||||||
assert(nthreads <= maxthreads);
|
|
||||||
if (nthreads == -1) nthreads = 1;
|
|
||||||
if (mythread < nthreads) {
|
|
||||||
for (int i = mythread; i < Packets.size(); i += nthreads) {
|
|
||||||
double start = usecond();
|
|
||||||
uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
|
|
||||||
Packets[i].to_rank,
|
|
||||||
Packets[i].recv_buf,
|
|
||||||
Packets[i].from_rank,
|
|
||||||
Packets[i].bytes,i);
|
|
||||||
comm_bytes_thr[mythread] += bytes;
|
|
||||||
shm_bytes_thr[mythread] += Packets[i].bytes - bytes;
|
|
||||||
comm_time_thr[mythread] += usecond() - start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else { // Concurrent and non-threaded asynch calls to MPI
|
|
||||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
|
||||||
this->CommunicateBegin(reqs);
|
|
||||||
this->CommunicateComplete(reqs);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
@ -527,7 +509,6 @@ public:
|
|||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
mpi3synctime_g+=usecond();
|
mpi3synctime_g+=usecond();
|
||||||
|
|
||||||
// conformable(source.Grid(),_grid);
|
|
||||||
assert(source.Grid()==_grid);
|
assert(source.Grid()==_grid);
|
||||||
halogtime-=usecond();
|
halogtime-=usecond();
|
||||||
|
|
||||||
@ -586,13 +567,8 @@ public:
|
|||||||
CommsMerge(decompress,Mergers,Decompressions);
|
CommsMerge(decompress,Mergers,Decompressions);
|
||||||
}
|
}
|
||||||
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
||||||
mpi3synctime-=usecond();
|
assert(MergersSHM.size()==0);
|
||||||
accelerator_barrier();
|
assert(DecompressionsSHM.size()==0);
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
|
||||||
mpi3synctime+=usecond();
|
|
||||||
shmmergetime-=usecond();
|
|
||||||
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
|
|
||||||
shmmergetime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class decompressor>
|
template<class decompressor>
|
||||||
@ -609,6 +585,7 @@ public:
|
|||||||
decompress.Exchange(mp,vp0,vp1,type,o);
|
decompress.Exchange(mp,vp0,vp1,type,o);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
if ( mm.size() ) acceleratorFenceComputeStream();
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
|
|
||||||
decompresstime-=usecond();
|
decompresstime-=usecond();
|
||||||
@ -619,7 +596,9 @@ public:
|
|||||||
decompress.Decompress(kp,mp,o);
|
decompress.Decompress(kp,mp,o);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
if ( dd.size() ) acceleratorFenceComputeStream();
|
||||||
decompresstime+=usecond();
|
decompresstime+=usecond();
|
||||||
|
|
||||||
}
|
}
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
// Set up routines
|
// Set up routines
|
||||||
|
@ -249,14 +249,16 @@ inline int acceleratorIsCommunicable(void *ptr)
|
|||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
#if 0
|
||||||
#include <CL/sycl.hpp>
|
#include <CL/sycl.hpp>
|
||||||
#include <CL/sycl/usm.hpp>
|
#include <CL/sycl/usm.hpp>
|
||||||
|
|
||||||
#define GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
|
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
#include <level_zero/ze_api.h>
|
#include <level_zero/ze_api.h>
|
||||||
#include <CL/sycl/backend/level_zero.hpp>
|
#include <CL/sycl/backend/level_zero.hpp>
|
||||||
|
#else
|
||||||
|
#include <sycl/CL/sycl.hpp>
|
||||||
|
#include <sycl/usm.hpp>
|
||||||
|
#include <level_zero/ze_api.h>
|
||||||
|
#include <sycl/ext/oneapi/backend/level_zero.hpp>
|
||||||
#endif
|
#endif
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
#SBATCH -p QZ1J-ICX-PVC
|
#SBATCH -p QZ1J-ICX-PVC
|
||||||
##SBATCH -p QZ1J-SPR-PVC-2C
|
##SBATCH -p QZ1J-SPR-PVC-2C
|
||||||
|
|
||||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||||
|
|
||||||
export NT=8
|
export NT=8
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#SBATCH -p QZ1J-ICX-PVC
|
#SBATCH -p QZ1J-ICX-PVC
|
||||||
|
|
||||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||||
|
|
||||||
export NT=16
|
export NT=16
|
||||||
|
|
||||||
@ -19,16 +19,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
|
|||||||
export I_MPI_OFFLOAD_CELL=tile
|
export I_MPI_OFFLOAD_CELL=tile
|
||||||
export EnableImplicitScaling=0
|
export EnableImplicitScaling=0
|
||||||
export EnableWalkerPartition=0
|
export EnableWalkerPartition=0
|
||||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
|
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
|
||||||
|
|
||||||
for i in 0
|
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
|
||||||
do
|
do
|
||||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
|
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
|
||||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
|
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
|
||||||
done
|
done
|
||||||
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
|
|
||||||
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
|
|||||||
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
|
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
|
||||||
|
|
||||||
|
|
||||||
if [ $MPI_LOCALRANKID = "0" ]
|
#if [ $MPI_LOCALRANKID = "0" ]
|
||||||
then
|
#then
|
||||||
# ~psteinbr/build_pti/ze_tracer -h $@
|
# ~psteinbr/build_pti/ze_tracer -h $@
|
||||||
onetrace --chrome-device-timeline $@
|
# onetrace --chrome-device-timeline $@
|
||||||
else
|
#else
|
||||||
$@
|
$@
|
||||||
fi
|
#fi
|
||||||
|
Reference in New Issue
Block a user