mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Compare commits


23 Commits

Author SHA1 Message Date
Peter Boyle
da59379612 Large reg file for double 2024-03-26 17:03:20 +00:00
Peter Boyle
3ef2a41518 ifdef guard omitted 2024-03-26 14:50:32 +00:00
Peter Boyle
aa96f420c6 Accelerator aware MPI guard on the Unix domain sockets 2024-03-26 14:41:25 +00:00
Peter Boyle
49e9e4ed0e Fences 2024-03-26 14:14:06 +00:00
Peter Boyle
f7b8163016 Deterministic MPI reduce options 2024-03-26 14:11:40 +00:00
Peter Boyle
93769eacd3 Updated configure for bounce through host 2024-03-26 14:10:24 +00:00
Peter Boyle
59b0cc11df Reduce the time in single 2024-03-26 00:42:40 +00:00
Peter Boyle
f32c275376 Updated config options for MPI not being aware of GPU 2024-03-26 00:42:00 +00:00
Peter Boyle
5404fc66ab Merge needs a fence on SYCL 2024-03-26 00:38:41 +00:00
Peter Boyle
1f53458af8 Options to bounce through a host buffer if --disable-accelerator-aware-mpi 2024-03-26 00:37:19 +00:00
Peter Boyle
434c3e7f1d We have a choice of GET or PUT across NVlink 2024-03-25 14:32:44 +00:00
Peter Boyle
500b119f3d Deterministic MPI 2024-03-22 15:55:23 +00:00
Peter Boyle
4b87259c1b New config command for sunspot 2024-03-22 15:43:49 +00:00
Peter Boyle
503dec34ef This appears working now on Sunspot 2024-03-22 15:43:30 +00:00
Peter Boyle
d1e9fe50d2 Xor csum for repro testing 2024-03-22 15:42:57 +00:00
Peter Boyle
d01e5fa838 Improved FlightRecorder 2024-03-22 15:42:32 +00:00
Peter Boyle
a477c25e8c Sunspot repro tests 2024-03-22 15:42:11 +00:00
Peter Boyle
1bd20cd9e8 FlightRecorder 2024-03-22 15:40:01 +00:00
Peter Boyle
e49e95b037 Upgrade of the Britney test with flight recorder and fast xor checksum 2024-03-22 15:39:27 +00:00
Peter Boyle
6f59fed563 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:32 +00:00
Peter Boyle
60b7f6c99d Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:26 +00:00
Peter Boyle
b92dfcc8d3 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:27 +00:00
Peter Boyle
f6fd6dd053 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:01 +00:00
23 changed files with 778 additions and 344 deletions

View File

@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
return offbytes;
}
#undef NVLINK_GET // Define to use get instead of put DMA
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#endif
}
if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
{
int nreq=list.size();
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
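
The hunks above select between GET and PUT semantics for the intra-node NVlink path. A minimal standalone sketch of the choice, with illustrative stand-ins for Grid's ShmBufferTranslate and acceleratorCopyDeviceToDeviceAsynch (the stand-ins are not Grid API; the (from, to, bytes) argument order mirrors the diff):

#include <cassert>
#include <cstddef>
#include <cstring>

// Stand-in: map a neighbour rank's symmetric-heap pointer into our address
// space. In Grid this is SharedMemory::ShmBufferTranslate; here an identity
// stub so the sketch compiles.
static void *shm_translate(int rank, void *p) { (void)rank; return p; }
// Stand-in for the asynchronous device copy, (from, to, bytes) ordering.
static void async_copy(void *from, void *to, size_t bytes) { std::memcpy(to, from, bytes); }

void nvlink_exchange(int from, int dest, void *xmit, void *recv,
                     size_t xbytes, size_t rbytes) {
#ifdef NVLINK_GET
  // GET: the receiver pulls the sender's xmit buffer into its recv buffer.
  void *shm = shm_translate(from, xmit);
  assert(shm != nullptr);
  async_copy(shm, recv, rbytes);
#else
  // PUT: the sender pushes its xmit buffer into the receiver's recv buffer.
  void *shm = shm_translate(dest, recv);
  assert(shm != nullptr);
  async_copy(xmit, shm, xbytes);
#endif
}
// Either way the copies are asynchronous; the acceleratorCopySynchronise()
// added to StencilSendToRecvFromComplete fences them before the buffers
// are consumed.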

View File

@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
void * GlobalSharedMemory::HostCommBuf;
#endif
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
void *SharedMemory::HostBufferMalloc(size_t bytes){
void *ptr = (void *)host_heap_top;
host_heap_top += bytes;
host_heap_bytes+= bytes;
if (host_heap_bytes >= host_heap_size) {
std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
}
return ptr;
}
void SharedMemory::HostBufferFreeAll(void) {
host_heap_top =(size_t)HostCommBuf;
host_heap_bytes=0;
}
#endif
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
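
HostBufferMalloc/HostBufferFreeAll above form a slab (bump) allocator: allocation just advances a top pointer, and "free all" resets it. A self-contained sketch of the same pattern (the Slab type is illustrative, not Grid API):

#include <cassert>
#include <cstddef>
#include <cstdint>

struct Slab {
  uint8_t *base;     // backing host buffer (Grid: HostCommBuf)
  size_t   size;     // capacity in bytes   (Grid: host_heap_size)
  size_t   top = 0;  // current offset      (Grid: host_heap_top/bytes)
  void *alloc(size_t bytes) {
    assert(top + bytes <= size); // Grid prints a hint to raise --shm here
    void *p = base + top;
    top += bytes;                // bump: no per-allocation metadata
    return p;
  }
  void free_all() { top = 0; }   // zero-cost wipe, as in HostBufferFreeAll
};

Every packet's staging buffer dies at the end of a halo exchange, so a single free_all() after the exchange is the only bookkeeping required.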

View File

@ -75,7 +75,9 @@ public:
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
static void *HostCommBuf;
#endif
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
@ -120,6 +122,13 @@ private:
size_t heap_bytes;
size_t heap_size;
#ifndef ACCELERATOR_AWARE_MPI
size_t host_heap_top; // set in free all
size_t host_heap_bytes;// set in free all
void *HostCommBuf; // set in SetCommunicator
size_t host_heap_size; // set in SetCommunicator
#endif
protected:
Grid_MPI_Comm ShmComm; // for barriers
@ -151,7 +160,10 @@ public:
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
#ifndef ACCELERATOR_AWARE_MPI
void *HostBufferMalloc(size_t bytes);
void HostBufferFreeAll(void);
#endif
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////

View File

@ -39,9 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#endif
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h>
@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
HostCommBuf= malloc(bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#else
#ifdef GRID_MPI3_SHMMMAP
@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
ShmBufferFreeAll();
#ifndef ACCELERATOR_AWARE_MPI
host_heap_size = heap_size;
HostCommBuf= GlobalSharedMemory::HostCommBuf;
HostBufferFreeAll();
#endif
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////

View File

@ -285,13 +285,25 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
uint32_t csum=0;
// Uint32Checksum(left,csum);
#ifdef GRID_SYCL
uint64_t csum=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
// Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
}
FlightRecorder::CsumLog(csum);
#endif
ComplexD nrm = rankInnerProduct(left,right);
RealD local = real(nrm);
GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI
FlightRecorder::NormLog(real(nrm));
grid->GlobalSum(nrm);
GridMPINormLog(local,real(nrm));
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}

View File

@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
Word xorResult; xorResult = 0;
Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
Word identity; identity=0;
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
sum ^=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
Word ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
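
With the reduction operator switched from std::plus<> to std::bit_xor<>, svm_xor yields an order-independent checksum of a device-resident buffer. A usage fragment, assuming Grid's SYCL build (svm_xor and FlightRecorder::CsumLog are the names introduced in this changeset; the buffer name is illustrative):

// XOR-checksum `bytes` bytes of a device buffer and log it for repro tests.
// XOR is commutative and associative, so GPU scheduling order cannot change
// the answer -- unlike a floating-point sum.
uint64_t checksum_device_buffer(void *device_buffer, uint64_t bytes) {
  uint64_t *words  = (uint64_t *)device_buffer;
  uint64_t  nwords = bytes / sizeof(uint64_t);
  uint64_t  csum   = svm_xor(words, nwords);
  FlightRecorder::CsumLog(csum);
  return csum;
}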

View File

@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA
@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v ,st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
#ifndef GRID_CUDA
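
The two acceleratorFenceComputeStream() insertions pair with the asynchronous halo merge copies queued earlier: on SYCL the fence keeps the Dhop kernel from launching before the merged halo data is visible (cf. the "Merge needs a fence on SYCL" commit). In outline, restating the diff rather than adding any new API:

if (interior && exterior) {
  acceleratorFenceComputeStream(); // merged halo data must be visible first
  if (Opt == WilsonKernelsStatic::OptGeneric   ) { KERNEL_CALL(GenericDhopSite); return; }
  if (Opt == WilsonKernelsStatic::OptHandUnroll) { KERNEL_CALL(HandDhopSite);    return; }
}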

View File

@ -70,57 +70,6 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);
/*
template<class vobj,class cobj,class compressor>
void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline));
template<class vobj,class cobj,class compressor>
void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
{
int num=table.size();
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_forNB( i,num, vobj::Nsimd(), {
compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
});
rhs_v.ViewClose();
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type)
{
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
});
rhs_v.ViewClose();
}
*/
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@ -258,6 +207,10 @@ public:
struct Packet {
void * send_buf;
void * recv_buf;
#ifndef ACCELERATOR_AWARE_MPI
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
Integer to_rank;
Integer from_rank;
Integer do_send;
@ -324,7 +277,7 @@ public:
Vector<int> surface_list;
stencilVector<StencilEntry> _entries; // Resident in managed memory
commVector<StencilEntry> _entries_device; // Resident in managed memory
commVector<StencilEntry> _entries_device; // Resident in device memory
std::vector<Packet> Packets;
std::vector<Merge> Mergers;
std::vector<Merge> MergersSHM;
@ -408,33 +361,16 @@ public:
// Use OpenMP Tasks for cleaner ???
// must be called *inside* parallel region
//////////////////////////////////////////
/*
void CommunicateThreaded()
{
#ifdef GRID_OMP
int mythread = omp_get_thread_num();
int nthreads = CartesianCommunicator::nCommThreads;
#else
int mythread = 0;
int nthreads = 1;
#endif
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
}
}
}
*/
////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel.
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
#ifdef ACCELERATOR_AWARE_MPI
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
@ -443,16 +379,54 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#else
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
if ( Packets[i].do_send ) {
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
}
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].host_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].host_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#endif
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
}
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
acceleratorCopySynchronise();
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
// accelerator_barrier();
_grid->StencilBarrier();
#ifndef ACCELERATOR_AWARE_MPI
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) {
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
}
}
_grid->HostBufferFreeAll();
#endif
// run any checksums
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
}
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
@ -528,6 +502,7 @@ public:
template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid);
@ -540,10 +515,9 @@ public:
compress.Point(point);
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier();
accelerator_barrier(); // All my local gathers are complete
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
/////////////////////////
@ -579,6 +553,7 @@ public:
accelerator_forNB(j, words, cobj::Nsimd(), {
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
}
}
@ -669,6 +644,7 @@ public:
for(int i=0;i<dd.size();i++){
decompressor::DecompressFace(decompress,dd[i]);
}
acceleratorFenceComputeStream(); // dependent kernels
}
////////////////////////////////////////
// Set up routines
@ -1224,7 +1200,6 @@ public:
///////////////////////////////////////////////////////////
int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, do_send,
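
Putting the stencil changes together: when MPI is not accelerator aware, CommunicateBegin stages every packet through the host slab, and CommunicateComplete copies receives back to the device before checksums and dependent kernels run. A condensed restatement of the diff's host-bounce round trip (request bookkeeping and Dirichlet logging omitted):

// CommunicateBegin, #ifndef ACCELERATOR_AWARE_MPI path:
for (int i = 0; i < Packets.size(); i++) {
  Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
  Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
  if (Packets[i].do_send) // device -> host before MPI touches the buffer
    acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf, Packets[i].xbytes);
  // MPI_Isend / MPI_Irecv then operate on the *host* buffers ...
}
// CommunicateComplete, after StencilSendToRecvFromComplete (MPI is done):
for (int i = 0; i < Packets.size(); i++) {
  if (Packets[i].do_recv) // host -> device before dependent kernels launch
    acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf, Packets[i].rbytes);
}
_grid->HostBufferFreeAll(); // reset the slab; all staging buffers are dead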

Grid/util/FlightRecorder.cc (new file, 339 lines)
View File

@ -0,0 +1,339 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Init.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@MacBook-Pro.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////
// Grid Norm logging for repro testing
///////////////////////////////////////////////////////
int FlightRecorder::PrintEntireLog;
int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend;
int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter;
std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector;
std::vector<uint64_t> FlightRecorder::CsumLogVector;
std::vector<uint64_t> FlightRecorder::XmitLogVector;
std::vector<uint64_t> FlightRecorder::RecvLogVector;
void FlightRecorder::ResetCounters(void)
{
XmitLoggingCounter=0;
RecvLoggingCounter=0;
CsumLoggingCounter=0;
NormLoggingCounter=0;
ReductionLoggingCounter=0;
}
void FlightRecorder::Truncate(void)
{
ResetCounters();
XmitLogVector.resize(0);
RecvLogVector.resize(0);
NormLogVector.resize(0);
CsumLogVector.resize(0);
ReductionLogVector.resize(0);
}
void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
{
switch ( mode ) {
case LoggingModePrint:
SetLoggingModePrint();
break;
case LoggingModeRecord:
SetLoggingModeRecord();
break;
case LoggingModeVerify:
SetLoggingModeVerify();
break;
case LoggingModeNone:
LoggingMode = mode;
Truncate();
break;
default:
assert(0);
}
}
void FlightRecorder::SetLoggingModePrint(void)
{
std::cout << " FlightRecorder: set to print output " <<std::endl;
Truncate();
LoggingMode = LoggingModePrint;
}
void FlightRecorder::SetLoggingModeRecord(void)
{
std::cout << " FlightRecorder: set to RECORD " <<std::endl;
Truncate();
LoggingMode = LoggingModeRecord;
}
void FlightRecorder::SetLoggingModeVerify(void)
{
std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
ResetCounters();
LoggingMode = LoggingModeVerify;
}
uint64_t FlightRecorder::ErrorCount(void)
{
return ErrorCounter;
}
void FlightRecorder::NormLog(double value)
{
uint64_t hex = * ( (uint64_t *)&value );
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLogVector.push_back(value);
NormLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(NormLoggingCounter < NormLogVector.size()){
uint64_t hexref = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
NormLoggingCounter,NormLogVector.size(),
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
if(!ContinueOnFail)assert(0); // Force takedown of job
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
<<" "<<hex<<" "<<hexref
<<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
}
}
}
if ( NormLogVector.size()==NormLoggingCounter ) {
std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
}
NormLoggingCounter++;
}
}
void FlightRecorder::CsumLog(uint64_t hex)
{
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLogVector.push_back(hex);
CsumLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(CsumLoggingCounter < CsumLogVector.size()) {
uint64_t hexref = CsumLogVector[CsumLoggingCounter] ;
if ( hex != hexref ) {
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
CsumLoggingCounter,hex, hexref);
fflush(stderr);
if(!ContinueOnFail) assert(0); // Force takedown of job
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
}
}
}
if ( CsumLogVector.size()==CsumLoggingCounter ) {
std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
}
CsumLoggingCounter++;
}
}
void FlightRecorder::ReductionLog(double local,double global)
{
uint64_t hex_l = * ( (uint64_t *)&local );
uint64_t hex_g = * ( (uint64_t *)&global );
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
ReductionLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
ReductionLogVector.push_back(global);
ReductionLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(ReductionLoggingCounter < ReductionLogVector.size()){
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
ReductionLoggingCounter,ReductionLogVector.size(),
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
if ( !ContinueOnFail ) assert(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
}
}
}
if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
}
ReductionLoggingCounter++;
}
}
void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
{
if ( ChecksumCommsSend ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
XmitLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
XmitLogVector.push_back(_xor);
XmitLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(XmitLoggingCounter < XmitLogVector.size()){
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
XmitLoggingCounter,XmitLogVector.size(),
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
if ( !ContinueOnFail ) assert(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
}
}
}
if ( XmitLogVector.size()==XmitLoggingCounter ) {
std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
}
XmitLoggingCounter++;
}
#endif
} else {
uint64_t word = 1;
deviceVector<uint64_t> dev(1);
acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
acceleratorCopySynchronise();
MPI_Barrier(MPI_COMM_WORLD);
}
}
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
{
if ( ChecksumComms ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
RecvLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
RecvLogVector.push_back(_xor);
RecvLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(RecvLoggingCounter < RecvLogVector.size()){
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
RecvLoggingCounter,RecvLogVector.size(),
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
if ( !ContinueOnFail ) assert(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
}
}
}
if ( RecvLogVector.size()==RecvLoggingCounter ) {
std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl;
}
RecvLoggingCounter++;
}
#endif
}
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,43 @@
#pragma once
NAMESPACE_BEGIN(Grid);
class FlightRecorder {
public:
enum LoggingMode_t {
LoggingModeNone,
LoggingModePrint,
LoggingModeRecord,
LoggingModeVerify
};
static int LoggingMode;
static uint64_t ErrorCounter;
static int32_t XmitLoggingCounter;
static int32_t RecvLoggingCounter;
static int32_t CsumLoggingCounter;
static int32_t NormLoggingCounter;
static int32_t ReductionLoggingCounter;
static std::vector<uint64_t> XmitLogVector;
static std::vector<uint64_t> RecvLogVector;
static std::vector<uint64_t> CsumLogVector;
static std::vector<double> NormLogVector;
static std::vector<double> ReductionLogVector;
static int ContinueOnFail;
static int PrintEntireLog;
static int ChecksumComms;
static int ChecksumCommsSend;
static void SetLoggingModePrint(void);
static void SetLoggingModeRecord(void);
static void SetLoggingModeVerify(void);
static void SetLoggingMode(LoggingMode_t mode);
static void NormLog(double value);
static void CsumLog(uint64_t csum);
static void ReductionLog(double lcl, double glbl);
static void Truncate(void);
static void ResetCounters(void);
static uint64_t ErrorCount(void);
static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank);
};
NAMESPACE_END(Grid);
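
The intended driver pattern (record on the first pass, verify bit-for-bit on every later pass) appears in the updated Test_dwf_mixedcg_prec below. A minimal skeleton of that loop, assuming only the FlightRecorder API declared above; the pass count is illustrative:

// Britney repro skeleton: record pass zero, then demand every subsequent
// pass reproduce the logged norms/checksums exactly.
FlightRecorder::ContinueOnFail = 0;  // assert on first mismatch
const int npasses = 10;              // illustrative; the test loops on wall time
for (int iter = 0; iter < npasses; iter++) {
  if (iter == 0) FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
  else           FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
  // ... run the solve; inner products feed NormLog/CsumLog/ReductionLog ...
  assert(FlightRecorder::ErrorCount() == 0);
}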

View File

@ -90,129 +90,6 @@ NAMESPACE_BEGIN(Grid);
static Coordinate Grid_default_latt;
static Coordinate Grid_default_mpi;
///////////////////////////////////////////////////////
// Grid Norm logging for repro testing
///////////////////////////////////////////////////////
int GridNormLoggingMode;
int32_t GridNormLoggingCounter;
int32_t GridMPINormLoggingCounter;
std::vector<double> GridNormLogVector;
std::vector<double> GridMPINormLogVector;
std::vector<uint32_t> GridCsumLogVector;
void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
{
switch ( mode ) {
case GridNormLoggingModePrint:
SetGridNormLoggingModePrint();
break;
case GridNormLoggingModeRecord:
SetGridNormLoggingModeRecord();
break;
case GridNormLoggingModeVerify:
SetGridNormLoggingModeVerify();
break;
case GridNormLoggingModeNone:
GridNormLoggingMode = mode;
GridNormLoggingCounter=0;
GridMPINormLoggingCounter=0;
GridNormLogVector.resize(0);
GridCsumLogVector.resize(0);
GridMPINormLogVector.resize(0);
break;
default:
assert(0);
}
}
void SetGridNormLoggingModePrint(void)
{
std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl;
GridNormLoggingCounter = 0;
GridMPINormLoggingCounter=0;
GridNormLogVector.resize(0);
GridCsumLogVector.resize(0);
GridMPINormLogVector.resize(0);
GridNormLoggingMode = GridNormLoggingModePrint;
}
void SetGridNormLoggingModeRecord(void)
{
std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl;
GridNormLoggingCounter = 0;
GridMPINormLoggingCounter=0;
GridNormLogVector.resize(0);
GridCsumLogVector.resize(0);
GridMPINormLogVector.resize(0);
GridNormLoggingMode = GridNormLoggingModeRecord;
}
void SetGridNormLoggingModeVerify(void)
{
std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl;
GridNormLoggingCounter = 0;
GridMPINormLoggingCounter=0;
GridNormLoggingMode = GridNormLoggingModeVerify;
}
void GridNormLog(double value,uint32_t csum)
{
if(GridNormLoggingMode == GridNormLoggingModePrint) {
std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl;
GridNormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
GridNormLogVector.push_back(value);
GridCsumLogVector.push_back(csum);
GridNormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
assert(GridNormLoggingCounter < GridNormLogVector.size());
if ( (value != GridNormLogVector[GridNormLoggingCounter])
|| (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) {
std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl;
std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl;
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
GridNormLoggingCounter,GridNormLogVector.size(),
value, GridNormLogVector[GridNormLoggingCounter],
csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr);
assert(0); // Force takedown of job
}
if ( GridNormLogVector.size()==GridNormLoggingCounter ) {
std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl;
}
GridNormLoggingCounter++;
}
}
void GridMPINormLog(double local,double result)
{
if(GridNormLoggingMode == GridNormLoggingModePrint) {
std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl;
GridMPINormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
GridMPINormLogVector.push_back(result);
GridMPINormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl;
assert(GridMPINormLoggingCounter < GridMPINormLogVector.size());
if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) {
fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
GridMPINormLoggingCounter,GridMPINormLogVector.size(),
result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr);
assert(0); // Force takedown of job
}
if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) {
std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl;
}
GridMPINormLoggingCounter++;
}
}
int GridThread::_threads =1;
int GridThread::_hyperthreads=1;
int GridThread::_cores=1;

View File

@ -70,21 +70,6 @@ void GridParseLayout(char **argv,int argc,
void printHash(void);
enum GridNormLoggingMode_t {
GridNormLoggingModeNone,
GridNormLoggingModePrint,
GridNormLoggingModeRecord,
GridNormLoggingModeVerify
};
//extern int GridNormLoggingMode;
//extern int32_t GridNormLoggingCounter;
//extern std::vector<double> GridNormLogVector;
void SetGridNormLoggingModePrint(void);
void SetGridNormLoggingModeRecord(void);
void SetGridNormLoggingModeVerify(void);
void SetGridNormLoggingMode(GridNormLoggingMode_t mode);
void GridNormLog(double value,uint32_t csum);
void GridMPINormLog(double lcl, double glbl);
NAMESPACE_END(Grid);

View File

@ -1,6 +1,6 @@
#ifndef GRID_UTIL_H
#define GRID_UTIL_H
#pragma once
#include <Grid/util/Coordinate.h>
#include <Grid/util/Lexicographic.h>
#include <Grid/util/Init.h>
#endif
#include <Grid/util/FlightRecorder.h>

View File

@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
esac
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
AC_ARG_ENABLE([accelerator-cshift],
[AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
[ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
AC_ARG_ENABLE([accelerator-aware-mpi],
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
AC_ARG_ENABLE([ucx-buggy],
[AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
[ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
case ${ac_UCXBUGGY} in
case ${ac_ACCELERATOR_AWARE_MPI} in
yes)
ac_ACC_CSHIFT=no;;
*);;
esac
case ${ac_ACC_CSHIFT} in
yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
*);;
esac

View File

@ -1,16 +1,16 @@
TOOLS=$HOME/tools
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=no\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"

View File

@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

View File

@ -30,6 +30,7 @@ export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
cd $PBS_O_WORKDIR

View File

@ -1,6 +1,6 @@
#!/bin/bash
#PBS -l select=16:system=sunspot,place=scatter
#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
@ -15,13 +15,23 @@ module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=0
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
@ -46,29 +56,39 @@ cd $PBS_O_WORKDIR
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE
DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
mkdir -p $DIR
cd $DIR
echo $THIS_NODE > nodefile
#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
# ../../gpu_tile_compact.sh \
# ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
# --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
../../gpu_tile_compact.sh \
../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
echo $CMD > command-line
env > environment
$CMD &
done
wait
# Suspect the wait allows jobs to collide and knock each other out
#wait
sleep 6500
for n in ` eval echo {1..$NN} `
do
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
cd $DIR

View File

@ -1,4 +1,4 @@
TOOLS=$HOME/tools
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
@ -11,6 +11,6 @@ TOOLS=$HOME/tools
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"

View File

@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

View File

@ -0,0 +1,81 @@
#!/bin/bash
#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N repro1gpu
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
cd $PBS_O_WORKDIR
NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE
echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE
for g in {0..11}
do
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export numa=${NUMA_MAP[$g]}
export gpu_id=${GPU_MAP[$g]}
export tile_id=${TILE_MAP[$g]}
export gpu=$gpu_id.$tile_id
cd $PBS_O_WORKDIR
DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
mkdir -p $DIR
cd $DIR
echo $THIS_NODE > nodefile
echo $gpu > gpu
export ZE_AFFINITY_MASK=$gpu
export ONEAPI_DEVICE_FILTER=gpu,level_zero
CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
echo $CMD
$CMD &
done
done
wait

View File

@ -0,0 +1,97 @@
#!/bin/bash
#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=0
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
cd $PBS_O_WORKDIR
NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE
echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do
cd $PBS_O_WORKDIR
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
mkdir -p $DIR
cd $DIR
echo $THIS_NODE > nodefile
#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
# ../../gpu_tile_compact.sh \
# ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
# --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
../../gpu_tile_compact.sh \
../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
echo $CMD > command-line
env > environment
$CMD &
done
# Suspect the wait allows jobs to collide and knock each other out
#wait
sleep 6500
for n in ` eval echo {1..$NN} `
do
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
cd $DIR
grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*
done

View File

@ -34,6 +34,7 @@ using namespace Grid;
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
NAMESPACE_BEGIN(Grid);
template<class Matrix,class Field>
class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase<Field> {
@ -143,14 +144,21 @@ int main (int argc, char ** argv)
time_t start = time(NULL);
uint32_t csum, csumref;
csumref=0;
FlightRecorder::ContinueOnFail = 0;
FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=0;
if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s);
if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s);
if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s);
int iter=0;
do {
if ( iter == 0 ) {
SetGridNormLoggingMode(GridNormLoggingModeRecord);
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
} else {
SetGridNormLoggingMode(GridNormLoggingModeVerify);
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
}
std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
result_o = Zero();
@ -162,31 +170,22 @@ int main (int argc, char ** argv)
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
csum = crc(result_o);
assert(FlightRecorder::ErrorCount()==0);
if ( csumref == 0 ) {
csumref = csum;
} else {
if ( csum != csumref ) {
std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
assert(0);
} else {
std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
}
}
std::cout << " FlightRecorder is OK! "<<std::endl;
iter ++;
} while (time(NULL) < (start + nsecs/2) );
} while (time(NULL) < (start + nsecs/10) );
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
csumref=0;
int i=0;
do {
if ( i == 0 ) {
SetGridNormLoggingMode(GridNormLoggingModeRecord);
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
} else {
SetGridNormLoggingMode(GridNormLoggingModeVerify);
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
}
std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
result_o_2 = Zero();
@ -199,19 +198,9 @@ int main (int argc, char ** argv)
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
csum = crc(result_o);
if ( csumref == 0 ) {
csumref = csum;
} else {
if ( csum != csumref ) {
std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
assert(0);
} else {
std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
}
}
std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
assert(FlightRecorder::ErrorCount()==0);
std::cout << " FlightRecorder is OK! "<<std::endl;
i++;
} while (time(NULL) < (start + nsecs) );