Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)

Commit 5c3ace7c3e: Merge branch 'develop' into feature/scidac-wp1
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #define _GRID_FFT_H_
 
 #ifdef HAVE_FFTW
-#ifdef USE_MKL
+#if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>

@@ -386,6 +386,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
   return offbytes;
 }
 
+#undef NVLINK_GET // Define to use get instead of put DMA
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int dest,int dox,
@@ -418,9 +419,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
+#ifdef NVLINK_GET
+    void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+    assert(shm!=NULL);
+    acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+#endif
   }
 
   if (dox) {
+    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -428,9 +435,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
+#ifndef NVLINK_GET
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      assert(shm!=NULL);
      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
     }
   }
 
@@ -440,6 +450,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 {
   int nreq=list.size();
 
+  acceleratorCopySynchronise();
+
   if (nreq==0) return;
 
   std::vector<MPI_Status> status(nreq);
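The NVLINK_GET switch introduced above moves the intranode device-to-device copy from the send side to the receive side. A condensed sketch of the two data motions, using the same Grid calls as the hunk (put remains the default; ShmBufferTranslate(rank,p) maps a peer rank's buffer into this rank's address space over the shared-memory window):

#ifdef NVLINK_GET
    // GET: the receiver pulls the peer's send buffer through the shared window.
    void *shm = (void *) this->ShmBufferTranslate(from,xmit);
    assert(shm!=NULL);
    acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#else
    // PUT: the sender pushes its data into the peer's receive buffer.
    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
    assert(shm!=NULL);
    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif

Either way the copies are asynchronous, which is why StencilSendToRecvFromComplete now calls acceleratorCopySynchronise() before waiting on the MPI requests.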
@@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
 uint64_t GlobalSharedMemory::_ShmAllocBytes;
 
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+void * GlobalSharedMemory::HostCommBuf;
+#endif
 
 Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
 int GlobalSharedMemory::WorldShmRank;
@@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+void *SharedMemory::HostBufferMalloc(size_t bytes){
+  void *ptr = (void *)host_heap_top;
+  host_heap_top  += bytes;
+  host_heap_bytes+= bytes;
+  if (host_heap_bytes >= host_heap_size) {
+    std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
+    assert(host_heap_bytes<host_heap_size);
+  }
+  return ptr;
+}
+void SharedMemory::HostBufferFreeAll(void) {
+  host_heap_top  =(size_t)HostCommBuf;
+  host_heap_bytes=0;
+}
+#endif
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
   void *ptr = (void *)heap_top;
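HostBufferMalloc/HostBufferFreeAll above form a bump ("slab") allocator over the single malloc'd HostCommBuf: allocation just advances a cursor, and "freeing" resets the whole slab in O(1), which is what makes per-halo-exchange host staging cheap. A minimal standalone sketch of the same idea (BumpHeap is a hypothetical name, not Grid code):

#include <cassert>
#include <cstddef>
#include <cstdlib>

struct BumpHeap {
  char  *base; size_t size; size_t used;
  explicit BumpHeap(size_t bytes) : base((char *)std::malloc(bytes)), size(bytes), used(0) {}
  ~BumpHeap() { std::free(base); }
  void *alloc(size_t bytes) {
    assert(used + bytes <= size); // Grid prints a hint to raise --shm before asserting
    void *p = base + used;
    used += bytes;               // bump the cursor; no per-allocation metadata
    return p;
  }
  void freeAll(void) { used = 0; } // zero-cost wipe, as in HostBufferFreeAll
};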
@@ -75,7 +75,9 @@ public:
   static int Hugepages;
 
   static std::vector<void *> WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+  static void *HostCommBuf;
+#endif
   static Grid_MPI_Comm WorldComm;
   static int WorldRank;
   static int WorldSize;
@@ -120,6 +122,13 @@ private:
   size_t heap_bytes;
   size_t heap_size;
 
+#ifndef ACCELERATOR_AWARE_MPI
+  size_t host_heap_top;   // set in free all
+  size_t host_heap_bytes; // set in free all
+  void *HostCommBuf;      // set in SetCommunicator
+  size_t host_heap_size;  // set in SetCommunicator
+#endif
+
 protected:
 
   Grid_MPI_Comm ShmComm; // for barriers
@@ -151,7 +160,10 @@ public:
   void *ShmBufferTranslate(int rank,void * local_p);
   void *ShmBufferMalloc(size_t bytes);
   void ShmBufferFreeAll(void) ;
+#ifndef ACCELERATOR_AWARE_MPI
+  void *HostBufferMalloc(size_t bytes);
+  void HostBufferFreeAll(void);
+#endif
   //////////////////////////////////////////////////////////////////////////
   // Make info on Nodes & ranks and Shared memory available
   //////////////////////////////////////////////////////////////////////////
@@ -39,9 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCL
+#ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
+#define SHM_SOCKETS
+#endif
 #include <syscall.h>
-#define SHM_SOCKETS
 #endif
 
 #include <sys/socket.h>
@@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
 
-//if defined(GRID_SYCL)
-#if 0
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the pointer array for shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Each MPI rank should allocate our own buffer
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-
-  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
-    exit(EXIT_FAILURE);
-  }
-
-  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
-            << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-
-  SharedMemoryZero(ShmCommBuf,bytes);
-
-  assert(WorldShmSize == 1);
-  for(int r=0;r<WorldShmSize;r++){
-    WorldShmCommBufs[r] = ShmCommBuf;
-  }
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-#endif
-
-#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
@@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+  HostCommBuf= malloc(bytes);
+#endif
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   _ShmAllocBytes=bytes;
   _ShmAlloc=1;
 }
-#endif
 
 #else
 #ifdef GRID_MPI3_SHMMMAP
@@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   }
   ShmBufferFreeAll();
 
+#ifndef ACCELERATOR_AWARE_MPI
+  host_heap_size = heap_size;
+  HostCommBuf= GlobalSharedMemory::HostCommBuf;
+  HostBufferFreeAll();
+#endif
+
   /////////////////////////////////////////////////////////////////////
   // find comm ranks in our SHM group (i.e. which ranks are on our node)
   /////////////////////////////////////////////////////////////////////
@@ -302,12 +302,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   return nrm;
 }
 
 
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
+#ifdef GRID_SYCL
+  uint64_t csum=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // Hack
+    // Fast integer xor checksum. Can also be used in comms now.
+    autoView(l_v,left,AcceleratorRead);
+    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
+    uint64_t *base= (uint64_t *)&l_v[0];
+    csum=svm_xor(base,words);
+  }
+  FlightRecorder::CsumLog(csum);
+#endif
   ComplexD nrm = rankInnerProduct(left,right);
-  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
+  RealD local = real(nrm);
+  FlightRecorder::NormLog(real(nrm));
   grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm));
   return nrm;
 }
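With this instrumentation, innerProduct logs three things per call under GRID_SYCL: an XOR checksum of the left operand, the rank-local norm, and the globally summed value, so a run can later be replayed against its own log. A sketch of the intended record/verify cycle (assuming a Lattice field phi already exists; the FlightRecorder API is the new class added later in this commit):

FlightRecorder::SetLoggingModeRecord();   // pass 1: append norms/checksums to the log
ComplexD ref   = innerProduct(phi,phi);

FlightRecorder::SetLoggingModeVerify();   // pass 2: compare the rerun against the log
ComplexD rerun = innerProduct(phi,phi);
assert(FlightRecorder::ErrorCount()==0);  // mismatches assert unless ContinueOnFail is set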
@@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
   return result;
 }
 
-NAMESPACE_END(Grid);
-
-/*
-template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Double sumResult; zeroit(sumResult);
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
-  Double identity;  zeroit(identity);
+  Word xorResult; xorResult = 0;
+  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
+  Word identity;  identity=0;
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
      cgh.parallel_for(cl::sycl::range<1>{L},
                       Reduction,
                       [=] (cl::sycl::id<1> index, auto &sum) {
-          sum +=vec[index];
+          sum ^=vec[index];
      });
   });
  theGridAccelerator->wait();
-  Double ret = d_sum[0];
+  Word ret = d_sum[0];
  free(d_sum,*theGridAccelerator);
-  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
  return ret;
 }
+
+NAMESPACE_END(Grid);
+
+/*
 
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
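The svm_xor kernel is the old commented-out plus-reduction recycled with std::bit_xor, giving an order-insensitive 64-bit checksum of device memory. For reference, an equivalent standalone kernel in current SYCL 2020 spellings (the hunk uses the older cl::sycl names; q is any SYCL queue):

#include <sycl/sycl.hpp>
#include <cstdint>

uint64_t xor_checksum(sycl::queue &q, const uint64_t *vec, size_t L)
{
  uint64_t *d = sycl::malloc_shared<uint64_t>(1, q);  // scalar result in USM
  *d = 0;
  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(d, uint64_t(0), sycl::bit_xor<uint64_t>());
    cgh.parallel_for(sycl::range<1>{L}, red,
                     [=](sycl::id<1> i, auto &acc) { acc ^= vec[i]; });
  });
  q.wait();
  uint64_t ret = *d;
  sycl::free(d, q);
  return ret;
}

XOR is used rather than addition because it is exactly associative and commutative on integers, so the checksum is independent of whatever reduction order the runtime picks, which is what a reproducibility check needs.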
@@ -416,7 +416,7 @@ public:
     std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
     SeedFixedIntegers(seeds);
   }
-  void SeedFixedIntegers(const std::vector<int> &seeds){
+  void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
 
     // Everyone generates the same seed_seq based on input seeds
     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -433,7 +433,6 @@ public:
     // MT implementation does not implement fast discard even though
     // in principle this is possible
     ////////////////////////////////////////////////
-#if 1
     thread_for( lidx, _grid->lSites(), {
 
       int64_t gidx;
@@ -454,29 +453,12 @@ public:
 
       int l_idx=generator_idx(o_idx,i_idx);
       _generators[l_idx] = master_engine;
-      Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
-    });
-#else
-    // Everybody loops over global volume.
-    thread_for( gidx, _grid->_gsites, {
-
-      // Where is it?
-      int rank;
-      int o_idx;
-      int i_idx;
-
-      Coordinate gcoor;
-      _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-
-      // If this is one of mine we take it
-      if( rank == _grid->ThisRank() ){
-        int l_idx=generator_idx(o_idx,i_idx);
-        _generators[l_idx] = master_engine;
+      if ( britney ) {
+        Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
+      } else {
         Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
       }
     });
-#endif
 #else
     ////////////////////////////////////////////////////////////////
     // Machine and thread decomposition dependent seeding is efficient
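Both branches seed one master engine and fast-forward a private copy per site; the default skips by the global site index gidx (streams independent of the MPI decomposition), while britney=1 skips by the local index l_idx, tying the streams to the local layout. A standalone illustration of skip-ahead seeding with the standard library (Grid's Skip is a fast discard on its own engines; mt19937_64's discard and the stride here are stand-ins):

#include <random>
#include <cstdint>

std::mt19937_64 stream_for_site(uint64_t master_seed, uint64_t site_index)
{
  std::mt19937_64 eng(master_seed);
  eng.discard(site_index * 1024);  // jump to this site's private subsequence
  return eng;
}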
@@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
    autoView(st_v , st,AcceleratorRead);
 
    if( interior && exterior ) {
+     acceleratorFenceComputeStream();
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
    autoView(st_v ,st,AcceleratorRead);
 
    if( interior && exterior ) {
+     acceleratorFenceComputeStream();
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 #ifndef GRID_CUDA
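acceleratorFenceComputeStream() makes kernels subsequently launched on the compute stream wait for work already queued on the copy stream, so the combined interior+exterior Dhop kernel cannot read halo buffers the asynchronous gathers are still writing. On CUDA the same ordering could be expressed with an event (a sketch of the mechanism, not Grid's implementation; the two stream handles are assumed to exist):

#include <cuda_runtime.h>

void fence_compute_on_copies(cudaStream_t computeStream, cudaStream_t copyStream)
{
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, copyStream);            // mark the end of the queued copies
  cudaStreamWaitEvent(computeStream, ev, 0);  // later compute work waits on it
  cudaEventDestroy(ev);                       // safe: destruction defers to completion
}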
@@ -70,57 +70,6 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                  int off,std::vector<std::pair<int,int> > & table);
 
-/*
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)  __attribute__((noinline));
-
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
-{
-  int num=table.size();
-  std::pair<int,int> *table_v = & table[0];
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  accelerator_forNB( i,num, vobj::Nsimd(), {
-    compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
-  });
-  rhs_v.ViewClose();
-}
-
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-                                 commVector<cobj *> pointers,
-                                 int dimension,int plane,
-                                 int cbmask,compressor &compress,int type) __attribute__((noinline));
-
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
-                                 const Lattice<vobj> &rhs,
-                                 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
-                                 compressor &compress,int type)
-{
-  assert( (table.size()&0x1)==0);
-  int num=table.size()/2;
-  int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  auto rhs_p = &rhs_v[0];
-  auto p0=&pointers[0][0];
-  auto p1=&pointers[1][0];
-  auto tp=&table[0];
-  accelerator_forNB(j, num, vobj::Nsimd(), {
-    compress.CompressExchange(p0,p1, rhs_p, j,
-                              so+tp[2*j ].second,
-                              so+tp[2*j+1].second,
-                              type);
-  });
-  rhs_v.ViewClose();
-}
-*/
-
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -258,6 +207,10 @@ public:
   struct Packet {
     void * send_buf;
     void * recv_buf;
+#ifndef ACCELERATOR_AWARE_MPI
+    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
+    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
+#endif
     Integer to_rank;
     Integer from_rank;
     Integer do_send;
@@ -324,7 +277,7 @@ public:
   Vector<int> surface_list;
 
   stencilVector<StencilEntry>  _entries; // Resident in managed memory
-  commVector<StencilEntry> _entries_device; // Resident in managed memory
+  commVector<StencilEntry> _entries_device; // Resident in device memory
   std::vector<Packet> Packets;
   std::vector<Merge> Mergers;
   std::vector<Merge> MergersSHM;
@@ -408,33 +361,16 @@ public:
   //  Use OpenMP Tasks for cleaner ???
   // must be called *inside* parallel region
   //////////////////////////////////////////
-  /*
-  void CommunicateThreaded()
-  {
-#ifdef GRID_OMP
-    int mythread = omp_get_thread_num();
-    int nthreads = CartesianCommunicator::nCommThreads;
-#else
-    int mythread = 0;
-    int nthreads = 1;
-#endif
-    if (nthreads == -1) nthreads = 1;
-    if (mythread < nthreads) {
-      for (int i = mythread; i < Packets.size(); i += nthreads) {
-        uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-                                                      Packets[i].to_rank,
-                                                      Packets[i].recv_buf,
-                                                      Packets[i].from_rank,
-                                                      Packets[i].bytes,i);
-      }
-    }
-  }
-  */
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
+    // All GPU kernel tasks must complete
+    // accelerator_barrier(); // All kernels should ALREADY be complete
+    // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
+    // But the HaloGather had a barrier too.
+#ifdef ACCELERATOR_AWARE_MPI
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
                                         Packets[i].send_buf,
@@ -443,16 +379,54 @@ public:
                                         Packets[i].from_rank,Packets[i].do_recv,
                                         Packets[i].xbytes,Packets[i].rbytes,i);
     }
+#else
+#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
+    for(int i=0;i<Packets.size();i++){
+      // Introduce a host buffer with a cheap slab allocator and zero cost wipe all
+      Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
+      Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
+      if ( Packets[i].do_send ) {
+        acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
+      }
+      _grid->StencilSendToRecvFromBegin(MpiReqs,
                                         Packets[i].host_send_buf,
                                         Packets[i].to_rank,Packets[i].do_send,
                                         Packets[i].host_recv_buf,
                                         Packets[i].from_rank,Packets[i].do_recv,
                                         Packets[i].xbytes,Packets[i].rbytes,i);
+    }
+#endif
+    // Get comms started then run checksums
+    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_send )
+        FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+    }
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
-    _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
     if ( this->partialDirichlet ) DslashLogPartial();
     else if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
-    acceleratorCopySynchronise();
+    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
+    // accelerator_barrier();
     _grid->StencilBarrier();
+#ifndef ACCELERATOR_AWARE_MPI
+#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_recv ) {
+        acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
+      }
+    }
+    _grid->HostBufferFreeAll();
+#endif
+    // run any checksums
+    for(int i=0;i<Packets.size();i++){
+      if ( Packets[i].do_recv )
+        FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+    }
   }
   ////////////////////////////////////////////////////////////////////////
   // Blocking send and receive. Either sequential or parallel.
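When ACCELERATOR_AWARE_MPI is not defined, CommunicateBegin/CommunicateComplete bracket every MPI transfer with explicit device-to-host and host-to-device copies through the slab-allocated host buffers. Stripped of the Grid types, the round trip is (a self-contained MPI sketch; deviceToHost/hostToDevice are hypothetical stand-ins for acceleratorCopyFromDevice/acceleratorCopyToDevice):

#include <mpi.h>
#include <cstring>
#include <vector>

// placeholders for the real asynchronous device copies
static void deviceToHost(void *host,const void *dev,size_t bytes){ std::memcpy(host,dev,bytes); }
static void hostToDevice(void *dev,const void *host,size_t bytes){ std::memcpy(dev,host,bytes); }

void staged_exchange(void *send_dev,void *recv_dev,int bytes,int to,int from,MPI_Comm comm)
{
  std::vector<char> host_send(bytes), host_recv(bytes); // the real code slab-allocates these
  deviceToHost(host_send.data(), send_dev, bytes);      // CommunicateBegin: stage outbound halo
  MPI_Request reqs[2];
  MPI_Irecv(host_recv.data(), bytes, MPI_CHAR, from, 0, comm, &reqs[0]);
  MPI_Isend(host_send.data(), bytes, MPI_CHAR, to,   0, comm, &reqs[1]);
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);            // CommunicateComplete: MPI is done
  hostToDevice(recv_dev, host_recv.data(), bytes);      // unstage inbound halo to the device
}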
@@ -528,6 +502,7 @@ public:
   template<class compressor>
   void HaloGather(const Lattice<vobj> &source,compressor &compress)
   {
+    // accelerator_barrier();
     _grid->StencilBarrier();// Synch shared memory on a single nodes
 
     assert(source.Grid()==_grid);
@@ -540,10 +515,9 @@ public:
       compress.Point(point);
       HaloGatherDir(source,compress,point,face_idx);
     }
-    accelerator_barrier();
+    accelerator_barrier(); // All my local gathers are complete
     face_table_computed=1;
     assert(u_comm_offset==_unified_buffer_size);
 
   }
 
   /////////////////////////
@@ -579,6 +553,7 @@ public:
           accelerator_forNB(j, words, cobj::Nsimd(), {
             coalescedWrite(to[j] ,coalescedRead(from [j]));
           });
+          acceleratorFenceComputeStream();
         }
       }
 
@@ -669,6 +644,7 @@ public:
     for(int i=0;i<dd.size();i++){
       decompressor::DecompressFace(decompress,dd[i]);
     }
+    acceleratorFenceComputeStream(); // dependent kernels
   }
   ////////////////////////////////////////
   // Set up routines
@@ -1224,7 +1200,6 @@ public:
     ///////////////////////////////////////////////////////////
     int do_send = (comms_send|comms_partial_send) && (!shm_send );
     int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
-
     AddPacket((void *)&send_buf[comm_off],
               (void *)&recv_buf[comm_off],
               xmit_to_rank, do_send,

@@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid);
 NAMESPACE_END(Grid);
 
-
-#ifdef GRID_SYCL
-template<typename T> struct
-sycl::is_device_copyable<T, typename std::enable_if<
-    Grid::isGridTensor<T>::value && (!std::is_trivially_copyable<T>::value),
-    void>::type>
-  : public std::true_type {};
-#endif
Grid/util/FlightRecorder.cc (new file, 339 lines)
@@ -0,0 +1,339 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/Init.cc

    Copyright (C) 2015

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@MacBook-Pro.local>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Grid.h>

NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////
// Grid Norm logging for repro testing
///////////////////////////////////////////////////////
int FlightRecorder::PrintEntireLog;
int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend;
int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter;
std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector;
std::vector<uint64_t> FlightRecorder::CsumLogVector;
std::vector<uint64_t> FlightRecorder::XmitLogVector;
std::vector<uint64_t> FlightRecorder::RecvLogVector;

void FlightRecorder::ResetCounters(void)
{
  XmitLoggingCounter=0;
  RecvLoggingCounter=0;
  CsumLoggingCounter=0;
  NormLoggingCounter=0;
  ReductionLoggingCounter=0;
}
void FlightRecorder::Truncate(void)
{
  ResetCounters();
  XmitLogVector.resize(0);
  RecvLogVector.resize(0);
  NormLogVector.resize(0);
  CsumLogVector.resize(0);
  ReductionLogVector.resize(0);
}
void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
{
  switch ( mode ) {
  case LoggingModePrint:
    SetLoggingModePrint();
    break;
  case LoggingModeRecord:
    SetLoggingModeRecord();
    break;
  case LoggingModeVerify:
    SetLoggingModeVerify();
    break;
  case LoggingModeNone:
    LoggingMode = mode;
    Truncate();
    break;
  default:
    assert(0);
  }
}

void FlightRecorder::SetLoggingModePrint(void)
{
  std::cout << " FlightRecorder: set to print output " <<std::endl;
  Truncate();
  LoggingMode = LoggingModePrint;
}
void FlightRecorder::SetLoggingModeRecord(void)
{
  std::cout << " FlightRecorder: set to RECORD " <<std::endl;
  Truncate();
  LoggingMode = LoggingModeRecord;
}
void FlightRecorder::SetLoggingModeVerify(void)
{
  std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
  ResetCounters();
  LoggingMode = LoggingModeVerify;
}
uint64_t FlightRecorder::ErrorCount(void)
{
  return ErrorCounter;
}
void FlightRecorder::NormLog(double value)
{
  uint64_t hex = * ( (uint64_t *)&value );
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLoggingCounter++;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLogVector.push_back(value);
    NormLoggingCounter++;
  }
  if(LoggingMode == LoggingModeVerify) {

    if(NormLoggingCounter < NormLogVector.size()){
      uint64_t hexref = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );

      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {

        std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
                 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
                 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;

        std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;

        fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
                GridHostname(),
                GlobalSharedMemory::WorldShmRank,
                NormLoggingCounter,NormLogVector.size(),
                value, NormLogVector[NormLoggingCounter]); fflush(stderr);

        if(!ContinueOnFail)assert(0); // Force takedown of job

        ErrorCounter++;
      } else {
        if ( PrintEntireLog ) {
          std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
                   <<" "<<hex<<" "<<hexref
                   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
        }
      }

    }
    if ( NormLogVector.size()==NormLoggingCounter ) {
      std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
    }
    NormLoggingCounter++;
  }
}
void FlightRecorder::CsumLog(uint64_t hex)
{
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLoggingCounter++;
  }

  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLogVector.push_back(hex);
    CsumLoggingCounter++;
  }

  if(LoggingMode == LoggingModeVerify) {

    if(CsumLoggingCounter < CsumLogVector.size()) {

      uint64_t hexref = CsumLogVector[CsumLoggingCounter] ;

      if ( hex != hexref ) {

        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
                 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;

        fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
                GridHostname(),
                GlobalSharedMemory::WorldShmRank,
                CsumLoggingCounter,hex, hexref);
        fflush(stderr);

        if(!ContinueOnFail) assert(0); // Force takedown of job

        ErrorCounter++;

      } else {

        if ( PrintEntireLog ) {
          std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
                   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
        }
      }
    }
    if ( CsumLogVector.size()==CsumLoggingCounter ) {
      std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
    }
    CsumLoggingCounter++;
  }
}
void FlightRecorder::ReductionLog(double local,double global)
{
  uint64_t hex_l = * ( (uint64_t *)&local );
  uint64_t hex_g = * ( (uint64_t *)&global );
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
    ReductionLoggingCounter++;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
    ReductionLogVector.push_back(global);
    ReductionLoggingCounter++;
  }
  if(LoggingMode == LoggingModeVerify) {
    if(ReductionLoggingCounter < ReductionLogVector.size()){
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
        fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
                GridHostname(),
                GlobalSharedMemory::WorldShmRank,
                ReductionLoggingCounter,ReductionLogVector.size(),
                global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);

        if ( !ContinueOnFail ) assert(0);

        ErrorCounter++;
      } else {
        if ( PrintEntireLog ) {
          std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
        }
      }
    }
    if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
    }
    ReductionLoggingCounter++;
  }
}
void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
{
  if ( ChecksumCommsSend ){
    uint64_t *ubuf = (uint64_t *)buf;
    if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
    uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
    if(LoggingMode == LoggingModePrint) {
      std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      XmitLoggingCounter++;
    }
    if(LoggingMode == LoggingModeRecord) {
      std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      XmitLogVector.push_back(_xor);
      XmitLoggingCounter++;
    }
    if(LoggingMode == LoggingModeVerify) {
      if(XmitLoggingCounter < XmitLogVector.size()){
        if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
          fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
                  GridHostname(),
                  GlobalSharedMemory::WorldShmRank,
                  XmitLoggingCounter,XmitLogVector.size(),
                  _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);

          if ( !ContinueOnFail ) assert(0);

          ErrorCounter++;
        } else {
          if ( PrintEntireLog ) {
            std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
          }
        }
      }
      if ( XmitLogVector.size()==XmitLoggingCounter ) {
        std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
      }
      XmitLoggingCounter++;
    }
#endif
  } else {
    uint64_t word = 1;
    deviceVector<uint64_t> dev(1);
    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
    acceleratorCopySynchronise();
    MPI_Barrier(MPI_COMM_WORLD);
  }
}
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
{
  if ( ChecksumComms ){
    uint64_t *ubuf = (uint64_t *)buf;
    if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
    uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
    if(LoggingMode == LoggingModePrint) {
      std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      RecvLoggingCounter++;
    }
    if(LoggingMode == LoggingModeRecord) {
      std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
      RecvLogVector.push_back(_xor);
      RecvLoggingCounter++;
    }
    if(LoggingMode == LoggingModeVerify) {
      if(RecvLoggingCounter < RecvLogVector.size()){
        if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
          fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
                  GridHostname(),
                  GlobalSharedMemory::WorldShmRank,
                  RecvLoggingCounter,RecvLogVector.size(),
                  _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);

          if ( !ContinueOnFail ) assert(0);

          ErrorCounter++;
        } else {
          if ( PrintEntireLog ) {
            std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
          }
        }
      }
      if ( RecvLogVector.size()==RecvLoggingCounter ) {
        std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl;
      }
      RecvLoggingCounter++;
    }
#endif
  }
}

NAMESPACE_END(Grid);
Grid/util/FlightRecorder.h (new file, 43 lines)
@@ -0,0 +1,43 @@
#pragma once

NAMESPACE_BEGIN(Grid);
class FlightRecorder {
 public:
  enum LoggingMode_t {
    LoggingModeNone,
    LoggingModePrint,
    LoggingModeRecord,
    LoggingModeVerify
  };

  static int LoggingMode;
  static uint64_t ErrorCounter;
  static int32_t XmitLoggingCounter;
  static int32_t RecvLoggingCounter;
  static int32_t CsumLoggingCounter;
  static int32_t NormLoggingCounter;
  static int32_t ReductionLoggingCounter;
  static std::vector<uint64_t> XmitLogVector;
  static std::vector<uint64_t> RecvLogVector;
  static std::vector<uint64_t> CsumLogVector;
  static std::vector<double> NormLogVector;
  static std::vector<double> ReductionLogVector;
  static int ContinueOnFail;
  static int PrintEntireLog;
  static int ChecksumComms;
  static int ChecksumCommsSend;
  static void SetLoggingModePrint(void);
  static void SetLoggingModeRecord(void);
  static void SetLoggingModeVerify(void);
  static void SetLoggingMode(LoggingMode_t mode);
  static void NormLog(double value);
  static void CsumLog(uint64_t csum);
  static void ReductionLog(double lcl, double glbl);
  static void Truncate(void);
  static void ResetCounters(void);
  static uint64_t ErrorCount(void);
  static void xmitLog(void *,uint64_t bytes);
  static void recvLog(void *,uint64_t bytes,int rank);
};
NAMESPACE_END(Grid);
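All FlightRecorder state is static, so any rank can drive it globally. A sketch of the knobs an application might set (names are taken from the header above; the surrounding Grid_init boilerplate is assumed):

FlightRecorder::ContinueOnFail    = 1;  // count mismatches instead of assert(0)
FlightRecorder::PrintEntireLog    = 0;  // print only failures, not every valid entry
FlightRecorder::ChecksumCommsSend = 1;  // enable the xmitLog XOR checksums
FlightRecorder::ChecksumComms     = 1;  // enable the recvLog XOR checksums
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
// ... run the workload once, then rerun it under LoggingModeVerify
// and inspect FlightRecorder::ErrorCount() ...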
@@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts)
 }
 #endif
 
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////
@@ -90,7 +94,12 @@ int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;
 
+char hostname[HOST_NAME_MAX+1];
+
+char *GridHostname(void)
+{
+  return hostname;
+}
 const Coordinate &GridDefaultLatt(void) {return Grid_default_latt;};
 const Coordinate &GridDefaultMpi(void)  {return Grid_default_mpi;};
 const Coordinate GridDefaultSimd(int dims,int nsimd)
@@ -394,7 +403,6 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
 
-  char hostname[HOST_NAME_MAX+1];
   gethostname(hostname, HOST_NAME_MAX+1);
   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
 
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 void Grid_init(int *argc,char ***argv);
 void Grid_finalize(void);
 
+char * GridHostname(void);
+
 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
@@ -68,5 +70,6 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);
 
 
+
 NAMESPACE_END(Grid);
 
@@ -1,6 +1,6 @@
-#ifndef GRID_UTIL_H
-#define GRID_UTIL_H
+#pragma once
 #include <Grid/util/Coordinate.h>
 #include <Grid/util/Lexicographic.h>
 #include <Grid/util/Init.h>
-#endif
+#include <Grid/util/FlightRecorder.h>
configure.ac
@@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
 esac
 
 ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
-AC_ARG_ENABLE([accelerator-cshift],
-              [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
-              [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
+AC_ARG_ENABLE([accelerator-aware-mpi],
+              [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
+              [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
 
-AC_ARG_ENABLE([ucx-buggy],
-              [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
-              [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
-
-case ${ac_UCXBUGGY} in
+case ${ac_ACCELERATOR_AWARE_MPI} in
 yes)
-  ac_ACC_CSHIFT=no;;
-*);;
-esac
-
-case ${ac_ACC_CSHIFT} in
-yes)
-  AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
+  AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
+  AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
 *);;
 esac
 
@@ -1,16 +1,16 @@
-TOOLS=$HOME/tools
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
-	--enable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
+	--enable-accelerator-aware-mpi=no\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
systems/Aurora/sourceme-sunspot-deterministic.sh (new file, 2 lines)
@@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
41
systems/Aurora/tests/repro128.pbs
Normal file
41
systems/Aurora/tests/repro128.pbs
Normal file
@ -0,0 +1,41 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=128
+#PBS -l walltime=02:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+# 12 ppn, 128 nodes, 1536 ranks
+CMD="mpiexec -np 1536 -ppn 12 -envall \
+	./gpu_tile_compact.sh \
+	./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
+	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
+$CMD
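A quick arithmetic cross-check of the launch line above (the first of the two ppn comments looks carried over from a 16-node variant; the second matches this launch). The four-dimensional --mpi layout must multiply out to the -np rank count, and dividing --grid by --mpi gives the per-rank local lattice:

// Editor's illustration, not part of the commit: check the launch geometry.
#include <cassert>
#include <cstdio>
int main() {
  int mpi[4]  = {4,4,4,24};           // --mpi 4.4.4.24
  int grid[4] = {128,128,128,384};    // --grid 128.128.128.384
  int ranks = 1;
  for (int d = 0; d < 4; d++) ranks *= mpi[d];
  assert(ranks == 128 * 12);          // 128 nodes x 12 ppn = 1536 = -np
  for (int d = 0; d < 4; d++)
    printf("dim %d: local extent %d\n", d, grid[d] / mpi[d]);  // 32 32 32 16
  return 0;
}
// ---- end sketch ----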
@ -2,26 +2,39 @@
 
 ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
 
-#PBS -q EarlyAppAccess
-#PBS -l select=16
-#PBS -l walltime=01:00:00
+#PBS -l select=16:system=sunspot,place=scatter
 #PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=01:00:00
+#PBS -N dwf
+#PBS -k doe
 
 #export OMP_PROC_BIND=spread
 #unset OMP_PLACES
 
 cd $PBS_O_WORKDIR
 
-source ../sourceme.sh
+#source ../sourceme.sh
 
 cat $PBS_NODEFILE
 
+#export MPICH_COLL_SYNC=1
+#export MPICH_ENV_DISPLAY=1
+export MPICH_
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
 
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
@ -32,9 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
 
-# 12 ppn, 16 nodes, 192 ranks
+DIR=repro.$PBS_JOBID
+mkdir $DIR
+cd $DIR
 
 CMD="mpiexec -np 192 -ppn 12 -envall \
-	./gpu_tile_compact.sh \
-	./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
-	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+	../gpu_tile_compact.sh \
+	../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative"
+#--comms-overlap
 $CMD
 
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
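The grep Oops post-mortem added above relies on per-rank stderr files named Grid.stderr.<rank>, which is what the new --debug-stdout flag produces. A hedged sketch of that redirect pattern (illustration only; Grid's actual bookkeeping may differ in detail):

// Editor's illustration, not part of the commit. Sketch of the per-rank
// stderr redirect the post-mortem grep depends on; rank 7 is a stand-in.
#include <cstdio>
int main() {
  int rank = 7;
  char fname[64];
  snprintf(fname, sizeof(fname), "Grid.stderr.%d", rank);
  freopen(fname, "w", stderr);   // later writes to stderr land in the file
  fprintf(stderr, "Oops: example failure line the grep above would catch\n");
  return 0;
}
// ---- end sketch ----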
82 systems/Aurora/tests/repro1gpu.pbs Normal file
@ -0,0 +1,82 @@
+#!/bin/bash
+
+#PBS -l select=16:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N repro1gpu
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+
+for g in {0..11}
+do
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+
+export numa=${NUMA_MAP[$g]}
+export gpu_id=${GPU_MAP[$g]}
+export tile_id=${TILE_MAP[$g]}
+export gpu=$gpu_id.$tile_id
+
+cd $PBS_O_WORKDIR
+
+DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+echo $gpu > gpu
+
+export ZE_AFFINITY_MASK=$gpu
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
+	numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
+	--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
+echo $CMD
+$CMD &
+
+done
+done
+
+wait
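The NUMA_MAP/TILE_MAP/GPU_MAP tables enumerate the twelve GPU tiles of a node (six GPUs, two tiles each, split across two NUMA domains). The tables collapse to simple index arithmetic, as this small check shows (an illustration, not part of the job script):

// Editor's illustration, not part of the commit: the three lookup tables
// above reduce to closed-form index arithmetic.
#include <cstdio>
int main() {
  int NUMA_MAP[12] = {0,0,0,1,1,1,0,0,0,1,1,1};
  int TILE_MAP[12] = {0,0,0,0,0,0,1,1,1,1,1,1};
  int GPU_MAP[12]  = {0,1,2,3,4,5,0,1,2,3,4,5};
  for (int g = 0; g < 12; g++) {
    int gpu_id  = g % 6;        // six GPUs per node
    int tile_id = g / 6;        // two tiles per GPU
    int numa    = (g / 3) % 2;  // NUMA domain alternates every three slots
    printf("g=%2d  ZE_AFFINITY_MASK=%d.%d  numactl -N %d  (tables: %d.%d / %d)\n",
           g, gpu_id, tile_id, numa, GPU_MAP[g], TILE_MAP[g], NUMA_MAP[g]);
  }
  return 0;
}
// ---- end sketch ----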
98 systems/Aurora/tests/reproN.pbs Normal file
@ -0,0 +1,98 @@
+#!/bin/bash
+
+#PBS -l select=32:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N reproN
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+
+export GRID_PRINT_ENTIRE_LOG=0
+export GRID_CHECKSUM_RECV_BUF=0
+export GRID_CHECKSUM_SEND_BUF=0
+
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+cd $PBS_O_WORKDIR
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
+
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+
+#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
+#	../../gpu_tile_compact.sh \
+#	../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+#	--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
+	../../gpu_tile_compact.sh \
+	../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+echo $CMD > command-line
+env > environment
+$CMD &
+
+done
+
+# Suspicious wait is allowing jobs to collide and knock out
+#wait
+
+sleep 6500
+
+for n in ` eval echo {1..$NN} `
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
+
+cd $DIR
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
+
+done
@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU
 CMD="mpiexec -np 192 -ppn 12 -envall \
 	./gpu_tile_compact.sh \
 	./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
-	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
 $CMD
@ -1,4 +1,4 @@
-TOOLS=$HOME/tools
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
2 systems/Sunspot/sourceme.sh Normal file
@ -0,0 +1,2 @@
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
81 systems/Sunspot/tests/repro1gpu.pbs Normal file
@ -0,0 +1,81 @@
+#!/bin/bash
+
+#PBS -l select=16:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N repro1gpu
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+
+for g in {0..11}
+do
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+
+export numa=${NUMA_MAP[$g]}
+export gpu_id=${GPU_MAP[$g]}
+export tile_id=${TILE_MAP[$g]}
+export gpu=$gpu_id.$tile_id
+
+cd $PBS_O_WORKDIR
+
+DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+echo $gpu > gpu
+
+export ZE_AFFINITY_MASK=$gpu
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+
+CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
+	numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
+	--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
+echo $CMD
+$CMD &
+
+done
+done
+
+wait
97 systems/Sunspot/tests/reproN.pbs Normal file
@ -0,0 +1,97 @@
+#!/bin/bash
+
+#PBS -l select=32:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=02:00:00
+#PBS -N reproN
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+
+# 56 cores / 6 threads ~9
+export OMP_NUM_THREADS=6
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+
+export GRID_PRINT_ENTIRE_LOG=0
+export GRID_CHECKSUM_RECV_BUF=1
+export GRID_CHECKSUM_SEND_BUF=0
+
+export MPICH_OFI_NIC_POLICY=GPU
+
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+cd $PBS_O_WORKDIR
+
+NN=`cat $PBS_NODEFILE | wc -l`
+echo $PBS_NODEFILE
+cat $PBS_NODEFILE
+
+echo $NN nodes in node file
+for n in `eval echo {1..$NN}`
+do
+
+cd $PBS_O_WORKDIR
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+echo Node $n is $THIS_NODE
+
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
+
+mkdir -p $DIR
+cd $DIR
+
+echo $THIS_NODE > nodefile
+
+#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
+#	../../gpu_tile_compact.sh \
+#	../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+#	--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
+	../../gpu_tile_compact.sh \
+	../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
+	--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
+
+echo $CMD > command-line
+env > environment
+$CMD &
+
+done
+
+# Suspicious wait is allowing jobs to collide and knock out
+#wait
+
+sleep 6500
+
+for n in ` eval echo {1..$NN} `
+do
+
+THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
+DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
+
+cd $DIR
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
+
+done
@ -30,6 +30,50 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
+
+NAMESPACE_BEGIN(Grid);
+template<class Matrix,class Field>
+class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase<Field> {
+ public:
+  Matrix &_Mat;
+  SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){};
+  virtual void Mpc (const Field &in, Field &out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = !in.Checkerboard();
+    // std::cout <<" Mpc starting"<<std::endl;
+
+    RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.Meooe(in,tmp);
+    nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl;
+    _Mat.MooeeInv(tmp,out);
+    nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.Meooe(out,tmp);
+    nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl;
+    _Mat.Mooee(in,out);
+    nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl;
+    axpy(out,-1.0,tmp,out);
+  }
+  virtual void MpcDag (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    // std::cout <<" MpcDag starting"<<std::endl;
+    RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.MeooeDag(in,tmp);
+    _Mat.MooeeInvDag(tmp,out);
+    nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.MeooeDag(out,tmp);
+    nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl;
+    _Mat.MooeeDag(in,out);
+    nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl;
+    axpy(out,-1.0,tmp,out);
+  }
+};
+
+NAMESPACE_END(Grid);
+
 int main (int argc, char ** argv)
 {
   char hostname[HOST_NAME_MAX+1];
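The point of the Paranoid variant is that every intermediate norm2() forces a global reduction, giving the FlightRecorder a dense stream of values to log on the first pass (LoggingModeRecord) and compare on later passes (LoggingModeVerify). A self-contained sketch of that record-then-verify idea follows; it mimics the pattern only and is not Grid's FlightRecorder implementation:

// Editor's illustration, not part of the commit: minimal record/verify
// recorder in the spirit of FlightRecorder.
#include <cassert>
#include <cstdint>
#include <vector>

struct MiniRecorder {
  enum Mode { Record, Verify };
  Mode mode = Record;
  std::vector<uint64_t> log;   // checksums captured on the Record pass
  size_t cursor = 0;
  int errors = 0;
  void note(uint64_t checksum) {
    if (mode == Record) log.push_back(checksum);
    else if (log.at(cursor++) != checksum) errors++;
  }
};

int main() {
  MiniRecorder rec;
  uint64_t reduction = 0x1234abcd;   // stand-in for a norm2() global sum
  rec.note(reduction);               // iteration 0: record
  rec.mode = MiniRecorder::Verify;
  rec.cursor = 0;
  rec.note(reduction);               // iteration 1: verify against the recording
  assert(rec.errors == 0);           // a nonzero count would flag non-determinism
  return 0;
}
// ---- end sketch ----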
@ -78,8 +122,8 @@ int main (int argc, char ** argv)
   result_o_2.Checkerboard() = Odd;
   result_o_2 = Zero();
 
-  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
-  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
   int nsecs=600;
   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
@ -100,10 +144,22 @@ int main (int argc, char ** argv)
 
   time_t start = time(NULL);
 
-  uint32_t csum, csumref;
-  csumref=0;
+  FlightRecorder::ContinueOnFail = 0;
+  FlightRecorder::PrintEntireLog = 0;
+  FlightRecorder::ChecksumComms = 1;
+  FlightRecorder::ChecksumCommsSend=0;
 
+  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s);
 
   int iter=0;
   do {
+    if ( iter == 0 ) {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    } else {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
+    }
     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
     result_o = Zero();
     t1=usecond();
@ -114,27 +170,23 @@ int main (int argc, char ** argv)
     flops+= CGsiteflops*FrbGrid->gSites()*iters;
     std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
+    std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
 
-    csum = crc(result_o);
-
-    if ( csumref == 0 ) {
-      csumref = csum;
-    } else {
-      if ( csum != csumref ) {
-        std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
-        assert(0);
-      } else {
-        std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
-      }
-    }
+    assert(FlightRecorder::ErrorCount()==0);
+    std::cout << " FlightRecorder is OK! "<<std::endl;
     iter ++;
-  } while (time(NULL) < (start + nsecs/2) );
+  } while (time(NULL) < (start + nsecs/10) );
 
   std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  csumref=0;
   int i=0;
   do {
+    if ( i == 0 ) {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    } else {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
+    }
     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
     result_o_2 = Zero();
     t1=usecond();
@ -146,19 +198,9 @@ int main (int argc, char ** argv)
 
     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
+    std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
-    csum = crc(result_o);
-
-    if ( csumref == 0 ) {
-      csumref = csum;
-    } else {
-      if ( csum != csumref ) {
-        std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
-        assert(0);
-      } else {
-        std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
-      }
-    }
+    assert(FlightRecorder::ErrorCount()==0);
+    std::cout << " FlightRecorder is OK! "<<std::endl;
     i++;
   } while (time(NULL) < (start + nsecs) );
|
Loading…
Reference in New Issue
Block a user