Mirror of https://github.com/paboyle/Grid.git, synced 2025-04-04 19:25:56 +01:00

Merge branch 'develop' into feature/dirichlet

This commit is contained in: 1177b8f661
@@ -262,7 +262,7 @@ public:
     autoView( Tnp_v , (*Tnp), AcceleratorWrite);
     autoView( Tnm_v , (*Tnm), AcceleratorWrite);
     const int Nsimd = CComplex::Nsimd();
-    accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+    accelerator_for(ss, FineGrid->oSites(), Nsimd, {
       coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
       coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
     });
@@ -264,7 +264,7 @@ public:
     auto Tnp_v = Tnp->View();
     auto Tnm_v = Tnm->View();
     constexpr int Nsimd = vector_type::Nsimd();
-    accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+    accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
       coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
       coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
     });
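Note: both hunks above replace the non-blocking launch `accelerator_forNB` with the blocking `accelerator_for` inside a three-term recurrence. Each step reads `y_v` and `Tnm_v` written by the previous kernel, so the next launch must not begin before the previous one completes. As a minimal sketch of the relationship, assuming Grid's usual convention that the blocking form is the non-blocking launch plus a barrier (the macro name below is hypothetical):

    // Sketch only: the blocking launch as non-blocking launch + wait on the
    // compute queue; my_accelerator_for is a hypothetical illustration.
    #define my_accelerator_for(iter, num, nsimd, ...)   \
      accelerator_forNB(iter, num, nsimd, __VA_ARGS__); \
      accelerator_barrier(dummy);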
@@ -395,12 +395,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
     }
   }

-  /* if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-   *   this->StencilSendToRecvFromComplete(list,dir);
-   *   list.resize(0);
-   * }
-   */
+  // if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  //   this->StencilSendToRecvFromComplete(list,dir);
+  // }
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
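Note: the surviving path queues intra-node halo transfers with `acceleratorCopyDeviceToDeviceAsynch` into the peer's shared-memory window and defers completion; the commented-out block records the abandoned alternative of completing sequentially inside Begin. A hedged sketch of the caller-side pairing (argument list deliberately elided; only the names visible in this diff are assumed):

    // Sketch, arguments elided: overlap compute with communications.
    std::vector<CommsRequest_t> list;
    comm.StencilSendToRecvFromBegin(list, /* ... */ dir); // async D2D copies + MPI
    // ... interior stencil work proceeds here ...
    comm.StencilSendToRecvFromComplete(list, dir);        // wait for MPI
    acceleratorCopySynchronise();                         // wait on the copy queue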
@@ -490,6 +490,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
+    acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
@@ -497,11 +498,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
+    acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
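Note: all three `acceleratorFenceComputeStream()` insertions guard the SYCL path. The exterior branch fences before consuming halo data delivered by the copy queue and again after producing its results; the full-kernel branch fences after the Dhop. Per the accelerator header change later in this commit, the fence is a full `accelerator_barrier()` only under GRID_SYCL and compiles to nothing on Nvidia and AMD, where in-stream issue order already implies execution order, so these inserts cost nothing on those backends.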
@@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);

   typedef Lattice<SiteLink>  LinkField;
   typedef Lattice<SiteField> Field;
-  typedef Field              ComplexField;
+  typedef LinkField          ComplexField;
 };

 typedef QedGImpl<vComplex> QedGImplR;
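Note: the typedef fix makes `ComplexField` alias the single-component link lattice rather than the full field type. Read together with the surrounding typedefs, the intent is roughly the following (that `SiteField` carries one component per direction is an assumption from context, not stated in the diff):

    typedef Lattice<SiteLink>  LinkField;    // one complex per site
    typedef Lattice<SiteField> Field;        // per-direction components (assumption)
    typedef LinkField          ComplexField; // a complex scalar field, not the full Field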
@@ -201,12 +201,15 @@ void acceleratorInit(void)
 #ifdef GRID_SYCL

 cl::sycl::queue *theGridAccelerator;
+cl::sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
   int nDevices = 1;
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
+  // theCopyAccelerator = new sycl::queue (selectedDevice);
+  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.

 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
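Note: `theCopyAccelerator` is introduced as a second queue handle but, for now, aliased to the compute queue; the commented-out line records the intended end state of a genuinely separate queue. A sketch of that alternative, where the `in_order` property is my assumption (not in the diff) to keep copies ordered among themselves:

    // Sketch of the commented-out alternative: an independent, in-order
    // copy queue on the same device (in_order property is an assumption).
    theCopyAccelerator = new cl::sycl::queue(selectedDevice,
                             cl::sycl::property::queue::in_order());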
@@ -248,7 +248,6 @@ inline int acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////

 #ifdef GRID_SYCL
 NAMESPACE_END(Grid);
 #include <CL/sycl.hpp>
@@ -263,6 +262,7 @@ NAMESPACE_END(Grid);
 NAMESPACE_BEGIN(Grid);

 extern cl::sycl::queue *theGridAccelerator;
+extern cl::sycl::queue *theCopyAccelerator;

 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@@ -290,7 +290,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
     cgh.parallel_for(                                 \
       cl::sycl::nd_range<3>(global,local),            \
       [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
-      [[intel::reqd_sub_group_size(8)]]               \
+      [[intel::reqd_sub_group_size(16)]]              \
       {                                               \
         auto iter1 = item.get_global_id(0);           \
         auto iter2 = item.get_global_id(1);           \
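Note: `[[intel::reqd_sub_group_size(...)]]` pins the sub-group (SIMD) width the compiler must use for the kernel. Raising it from 8 to 16 matches the Xe-HPC (PVC) hardware targeted by the new `systems/PVC` files below, which to the best of my knowledge supports sub-group sizes of 16 and 32 but not 8; that hardware detail is general knowledge, not stated in this diff.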
@@ -299,19 +299,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
       });                                             \
     });

-#define accelerator_barrier(dummy) theGridAccelerator->wait();
+#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); }

 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
-  theGridAccelerator->memcpy(to,from,bytes);
-}
-inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
+inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int acceleratorIsCommunicable(void *ptr)
 {
 #if 0
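Note: the blocking copies and `memset` now run on `theCopyAccelerator` and wait on that queue alone; only the device-to-device copy stays asynchronous. The intended usage pairing, as a sketch (function names are from this hunk; `src`, `dst`, `bytes` are placeholder variables):

    // Usage sketch: an asynchronous copy is only complete after an
    // explicit synchronise on the copy queue.
    acceleratorCopyDeviceToDeviceAsynch(src, dst, bytes); // enqueue, returns at once
    // ... overlap other work here ...
    acceleratorCopySynchronise();                         // theCopyAccelerator->wait()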
@@ -514,7 +514,16 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
 inline void acceleratorFreeCpu (void *ptr){free(ptr);};
 #endif

+//////////////////////////////////////////////
+// Fencing needed ONLY for SYCL
+//////////////////////////////////////////////
+
+#ifdef GRID_SYCL
+inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
+#else
+// Ordering within a stream guaranteed on Nvidia & AMD
+inline void acceleratorFenceComputeStream(void){ };
+#endif
+
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
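Note: this makes the contract explicit: `acceleratorFenceComputeStream()` is a full `accelerator_barrier()` only under GRID_SYCL and compiles to an empty function on the CUDA and HIP backends, where issue order within a stream already implies execution order. This is the helper the DhopKernel fences above rely on.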
@@ -27,6 +27,7 @@
 /* END LEGAL */
 extern "C" {
 #include <openssl/sha.h>
+#include <openssl/evp.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"
@@ -70,10 +71,8 @@ public:
   static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
   {
     std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
-    SHA256_CTX sha256;
-    SHA256_Init  (&sha256);
-    SHA256_Update(&sha256, data,bytes);
-    SHA256_Final (&hash[0], &sha256);
+    auto digest = EVP_get_digestbyname("SHA256");
+    EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL);
     return hash;
   }
   static inline std::vector<int> sha256_seeds(const std::string &s)
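Note: this replaces the low-level `SHA256_*` API, deprecated in OpenSSL 3.0, with the one-shot `EVP_Digest`. Where data arrives in chunks, the streaming EVP form is the equivalent; a self-contained sketch, not part of the diff:

    // Streaming equivalent of the one-shot EVP_Digest above (sketch).
    #include <openssl/evp.h>
    #include <vector>
    static std::vector<unsigned char> sha256_stream(const void *data, size_t bytes)
    {
      std::vector<unsigned char> hash(EVP_MD_size(EVP_sha256()));
      EVP_MD_CTX *ctx = EVP_MD_CTX_new();
      EVP_DigestInit_ex(ctx, EVP_sha256(), NULL);
      EVP_DigestUpdate(ctx, data, bytes);   // may be called once per chunk
      unsigned int len = 0;
      EVP_DigestFinal_ex(ctx, &hash[0], &len);
      EVP_MD_CTX_free(ctx);
      return hash;
    }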
@@ -148,7 +148,7 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
-- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
+- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 64 bytes).
 - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
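Note: the documented default for `--enable-gen-simd-width` rises from 32 to 64 bytes, consistent with the GPU-oriented targets this commit adds; the PVC configure command below passes `--enable-gen-simd-width=64` explicitly.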
New file (normal, 62 lines): systems/PVC/benchmarks/run-1tile.sh
@@ -0,0 +1,62 @@
+#!/bin/sh
+##SBATCH -p PVC-SPR-QZEH
+##SBATCH -p PVC-ICX-QZNW
+#SBATCH -p QZ1J-ICX-PVC
+##SBATCH -p QZ1J-SPR-PVC-2C
+
+source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+
+export NT=16
+
+export I_MPI_OFFLOAD=1
+export I_MPI_OFFLOAD_TOPOLIB=level_zero
+export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
+
+# export IGC_EnableLSCFenceUGMBeforeEOT=0
+# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
+export SYCL_DEVICE_FILTER=gpu,level_zero
+#export IGC_ShaderDumpEnable=1
+#export IGC_DumpToCurrentDir=1
+export I_MPI_OFFLOAD_CELL=tile
+export EnableImplicitScaling=0
+export EnableWalkerPartition=0
+export ZE_AFFINITY_MASK=0.0
+mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
+
+export ZE_AFFINITY_MASK=0
+export I_MPI_OFFLOAD_CELL=device
+export EnableImplicitScaling=1
+export EnableWalkerPartition=1
+
+#mpiexec -launcher ssh -n 2 -host localhost vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1
+
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+
+#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1
New file (executable, 26 lines): systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+##SBATCH -p PVC-SPR-QZEH
+##SBATCH -p PVC-ICX-QZNW
+#SBATCH -p QZ1J-ICX-PVC
+
+source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+
+export NT=16
+
+# export IGC_EnableLSCFenceUGMBeforeEOT=0
+# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
+#export IGC_ShaderDumpEnable=1
+#export IGC_DumpToCurrentDir=1
+export I_MPI_OFFLOAD=1
+export I_MPI_OFFLOAD_TOPOLIB=level_zero
+export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
+export SYCL_DEVICE_FILTER=gpu,level_zero
+export I_MPI_OFFLOAD_CELL=tile
+export EnableImplicitScaling=0
+export EnableWalkerPartition=0
+
+mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
+
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
New file (executable, 14 lines): systems/PVC/benchmarks/wrap.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
+
+echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
+
+if [ $MPI_LOCALRANKID = "0" ]
+then
+# ~psteinbr/build_pti/ze_tracer -c $@
+  onetrace --chrome-kernel-timeline $@
+else
+  $@
+fi
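Note: wrap.sh binds each local MPI rank to its own PVC tile by deriving `ZE_AFFINITY_MASK` from `MPI_LOCALRANKID`, and runs rank 0 under `onetrace` so a single kernel timeline is collected per job while the remaining ranks run the benchmark directly.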
New file (normal, 15 lines): systems/PVC/config-command
@@ -0,0 +1,15 @@
+INSTALL=/nfs/site/home/azusayax/install
+../../configure \
+	--enable-simd=GPU \
+	--enable-gen-simd-width=64 \
+	--enable-comms=mpi \
+	--disable-accelerator-cshift \
+	--disable-gparity \
+	--disable-fermion-reps \
+	--enable-shm=nvlink \
+	--enable-accelerator=sycl \
+	--enable-unified=yes \
+	CXX=mpicxx \
+	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
+	CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
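Note: this configure line matches the README change above (`--enable-simd=GPU` with a 64-byte generic SIMD width) and selects the SYCL accelerator with unified memory; accelerator cshift, G-parity, and extra fermion representations are disabled, presumably to keep the nightly build lean.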
New file (normal, 11 lines): systems/PVC/setup.sh
@@ -0,0 +1,11 @@
+export https_proxy=http://proxy-chain.intel.com:911
+export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+
+module load intel-release
+source /opt/intel/oneapi/PVC_setup.sh
+#source /opt/intel/oneapi/ATS_setup.sh
+module load intel/mpich/pvc45.3
+export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH
+
+#clsh embargo-ci-neo-022845
+#source /opt/intel/vtune_amplifier/amplxe-vars.sh
@@ -793,6 +793,7 @@ int main (int argc, char ** argv)
   }
   std::cout <<" OK ! "<<std::endl;

+#ifdef USE_FP16
   // Double to Half
   std::cout << GridLogMessage<< "Double to half" ;
   precisionChange(&H[0],&D[0],Ndp);
@@ -822,6 +823,7 @@ int main (int argc, char ** argv)
     assert( tmp < 1.0e-3 );
   }
   std::cout <<" OK ! "<<std::endl;
+#endif

 }
 Grid_finalize();
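Note: the new `USE_FP16` guard compiles the double-to-half `precisionChange` test only when half-precision support is configured, so builds without FP16 still compile and pass this test program.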