diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 29f0ec4b..2cbc895c 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -29,7 +29,7 @@ Author: Peter Boyle #define _GRID_FFT_H_ #ifdef HAVE_FFTW -#ifdef USE_MKL +#if defined(USE_MKL) || defined(GRID_SYCL) #include #else #include diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index ec004a9b..5fa70da4 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -386,6 +386,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, return offbytes; } +#undef NVLINK_GET // Define to use get instead of put DMA double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, int dest,int dox, @@ -418,9 +419,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(from,xmit); + assert(shm!=NULL); + acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); +#endif } if (dox) { + // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { tag= dir+_processor*32; ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); @@ -428,9 +435,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); +#endif + } } @@ -440,6 +450,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector status(nreq); diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc index ec42dd87..3445b077 100644 --- a/Grid/communicator/SharedMemory.cc +++ b/Grid/communicator/SharedMemory.cc @@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc; uint64_t GlobalSharedMemory::_ShmAllocBytes; std::vector GlobalSharedMemory::WorldShmCommBufs; +#ifndef ACCELERATOR_AWARE_MPI +void * GlobalSharedMemory::HostCommBuf; +#endif Grid_MPI_Comm GlobalSharedMemory::WorldShmComm; int GlobalSharedMemory::WorldShmRank; @@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void) ///////////////////////////////// // Alloc, free shmem region ///////////////////////////////// +#ifndef ACCELERATOR_AWARE_MPI +void *SharedMemory::HostBufferMalloc(size_t bytes){ + void *ptr = (void *)host_heap_top; + host_heap_top += bytes; + host_heap_bytes+= bytes; + if (host_heap_bytes >= host_heap_size) { + std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm flag" < WorldShmCommBufs; - +#ifndef ACCELERATOR_AWARE_MPI + static void *HostCommBuf; +#endif static Grid_MPI_Comm WorldComm; static int WorldRank; static int WorldSize; @@ -120,6 +122,13 @@ private: size_t heap_bytes; size_t heap_size; +#ifndef ACCELERATOR_AWARE_MPI + size_t host_heap_top; // set in free all + size_t host_heap_bytes;// set in free all + void *HostCommBuf; // set in SetCommunicator + size_t host_heap_size; // set in SetCommunicator +#endif + protected: Grid_MPI_Comm ShmComm; // for barriers @@ -151,7 +160,10 @@ public: void *ShmBufferTranslate(int rank,void * local_p); void *ShmBufferMalloc(size_t bytes); void ShmBufferFreeAll(void) ; - +#ifndef ACCELERATOR_AWARE_MPI + void *HostBufferMalloc(size_t bytes); + void HostBufferFreeAll(void); +#endif ////////////////////////////////////////////////////////////////////////// // Make info on Nodes & ranks and Shared memory available ////////////////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 64a86c4b..2600ce9c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -39,9 +39,11 @@ Author: Christoph Lehner #include #endif #ifdef GRID_SYCL +#ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC +#define SHM_SOCKETS +#endif #include -#define SHM_SOCKETS #endif #include @@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL) - -//if defined(GRID_SYCL) -#if 0 -void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) -{ - void * ShmCommBuf ; - assert(_ShmSetup==1); - assert(_ShmAlloc==0); - - ////////////////////////////////////////////////////////////////////////////////////////////////////////// - // allocate the pointer array for shared windows for our group - ////////////////////////////////////////////////////////////////////////////////////////////////////////// - MPI_Barrier(WorldShmComm); - WorldShmCommBufs.resize(WorldShmSize); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Each MPI rank should allocate our own buffer - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - ShmCommBuf = acceleratorAllocDevice(bytes); - - if (ShmCommBuf == (void *)NULL ) { - std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; - exit(EXIT_FAILURE); - } - - std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes - << "bytes at "<< std::hex<< ShmCommBuf < &left,const Lattice & return nrm; } + template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); + +#ifdef GRID_SYCL + uint64_t csum=0; + if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) + { + // Hack + // Fast integer xor checksum. Can also be used in comms now. + autoView(l_v,left,AcceleratorRead); + Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); + uint64_t *base= (uint64_t *)&l_v[0]; + csum=svm_xor(base,words); + } + FlightRecorder::CsumLog(csum); +#endif ComplexD nrm = rankInnerProduct(left,right); - // std::cerr<<"flight log " << std::hexfloat << nrm <<" "<GlobalSum(nrm); + FlightRecorder::ReductionLog(local,real(nrm)); return nrm; } diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index 90980c4c..8395eb7c 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite return result; } -NAMESPACE_END(Grid); -/* -template Double svm_reduce(Double *vec,uint64_t L) +template Word svm_xor(Word *vec,uint64_t L) { - Double sumResult; zeroit(sumResult); - Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); - Double identity; zeroit(identity); + Word xorResult; xorResult = 0; + Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); + Word identity; identity=0; theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); + auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); cgh.parallel_for(cl::sycl::range<1>{L}, Reduction, [=] (cl::sycl::id<1> index, auto &sum) { - sum +=vec[index]; + sum ^=vec[index]; }); }); theGridAccelerator->wait(); - Double ret = d_sum[0]; + Word ret = d_sum[0]; free(d_sum,*theGridAccelerator); - std::cout << " svm_reduce finished "< &seeds){ + void SeedFixedIntegers(const std::vector &seeds, int britney=0){ // Everyone generates the same seed_seq based on input seeds CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); @@ -433,7 +433,6 @@ public: // MT implementation does not implement fast discard even though // in principle this is possible //////////////////////////////////////////////// -#if 1 thread_for( lidx, _grid->lSites(), { int64_t gidx; @@ -454,29 +453,12 @@ public: int l_idx=generator_idx(o_idx,i_idx); _generators[l_idx] = master_engine; - Skip(_generators[l_idx],gidx); // Skip to next RNG sequence - }); -#else - // Everybody loops over global volume. - thread_for( gidx, _grid->_gsites, { - - // Where is it? - int rank; - int o_idx; - int i_idx; - - Coordinate gcoor; - _grid->GlobalIndexToGlobalCoor(gidx,gcoor); - _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); - - // If this is one of mine we take it - if( rank == _grid->ThisRank() ){ - int l_idx=generator_idx(o_idx,i_idx); - _generators[l_idx] = master_engine; + if ( britney ) { + Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence + } else { Skip(_generators[l_idx],gidx); // Skip to next RNG sequence } }); -#endif #else //////////////////////////////////////////////////////////////// // Machine and thread decomposition dependent seeding is efficient diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 90bee389..90defc54 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -462,6 +462,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} #ifndef GRID_CUDA @@ -495,6 +496,7 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { + acceleratorFenceComputeStream(); if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_CUDA diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index ef3aa821..80acb4ae 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -70,57 +70,6 @@ struct DefaultImplParams { void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, int off,std::vector > & table); -/* -template -void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); - -template -void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) -{ - int num=table.size(); - std::pair *table_v = & table[0]; - - auto rhs_v = rhs.View(AcceleratorRead); - accelerator_forNB( i,num, vobj::Nsimd(), { - compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); - }); - rhs_v.ViewClose(); -} - -/////////////////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split with compression -/////////////////////////////////////////////////////////////////// -template -void Gather_plane_exchange_table(const Lattice &rhs, - commVector pointers, - int dimension,int plane, - int cbmask,compressor &compress,int type) __attribute__((noinline)); - -template -void Gather_plane_exchange_table(commVector >& table, - const Lattice &rhs, - std::vector &pointers,int dimension,int plane,int cbmask, - compressor &compress,int type) -{ - assert( (table.size()&0x1)==0); - int num=table.size()/2; - int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - - auto rhs_v = rhs.View(AcceleratorRead); - auto rhs_p = &rhs_v[0]; - auto p0=&pointers[0][0]; - auto p1=&pointers[1][0]; - auto tp=&table[0]; - accelerator_forNB(j, num, vobj::Nsimd(), { - compress.CompressExchange(p0,p1, rhs_p, j, - so+tp[2*j ].second, - so+tp[2*j+1].second, - type); - }); - rhs_v.ViewClose(); -} -*/ - void DslashResetCounts(void); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashLogFull(void); @@ -258,6 +207,10 @@ public: struct Packet { void * send_buf; void * recv_buf; +#ifndef ACCELERATOR_AWARE_MPI + void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE + void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE +#endif Integer to_rank; Integer from_rank; Integer do_send; @@ -324,7 +277,7 @@ public: Vector surface_list; stencilVector _entries; // Resident in managed memory - commVector _entries_device; // Resident in managed memory + commVector _entries_device; // Resident in device memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -408,33 +361,16 @@ public: // Use OpenMP Tasks for cleaner ??? // must be called *inside* parallel region ////////////////////////////////////////// - /* - void CommunicateThreaded() - { -#ifdef GRID_OMP - int mythread = omp_get_thread_num(); - int nthreads = CartesianCommunicator::nCommThreads; -#else - int mythread = 0; - int nthreads = 1; -#endif - if (nthreads == -1) nthreads = 1; - if (mythread < nthreads) { - for (int i = mythread; i < Packets.size(); i += nthreads) { - uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes,i); - } - } - } - */ //////////////////////////////////////////////////////////////////////// // Non blocking send and receive. Necessarily parallel. //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + // All GPU kernel tasks must complete + // accelerator_barrier(); // All kernels should ALREADY be complete + // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + // But the HaloGather had a barrier too. +#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -443,16 +379,54 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } +#else +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } +#endif + // Get comms started then run checksums + // Having this PRIOR to the dslash seems to make Sunspot work... (!) + for(int i=0;i > &reqs) { - _grid->StencilSendToRecvFromComplete(MpiReqs,0); + _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - acceleratorCopySynchronise(); + // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete + // accelerator_barrier(); _grid->StencilBarrier(); +#ifndef ACCELERATOR_AWARE_MPI +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferFreeAll(); +#endif + // run any checksums + for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { + // accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -540,10 +515,9 @@ public: compress.Point(point); HaloGatherDir(source,compress,point,face_idx); } - accelerator_barrier(); + accelerator_barrier(); // All my local gathers are complete face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); - } ///////////////////////// @@ -579,6 +553,7 @@ public: accelerator_forNB(j, words, cobj::Nsimd(), { coalescedWrite(to[j] ,coalescedRead(from [j])); }); + acceleratorFenceComputeStream(); } } @@ -669,6 +644,7 @@ public: for(int i=0;i struct -sycl::is_device_copyable::value && (!std::is_trivially_copyable::value), - void>::type> - : public std::true_type {}; -#endif diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc new file mode 100644 index 00000000..4b8e0346 --- /dev/null +++ b/Grid/util/FlightRecorder.cc @@ -0,0 +1,339 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/Init.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); +/////////////////////////////////////////////////////// +// Grid Norm logging for repro testing +/////////////////////////////////////////////////////// +int FlightRecorder::PrintEntireLog; +int FlightRecorder::ContinueOnFail; +int FlightRecorder::LoggingMode; +int FlightRecorder::ChecksumComms; +int FlightRecorder::ChecksumCommsSend; +int32_t FlightRecorder::XmitLoggingCounter; +int32_t FlightRecorder::RecvLoggingCounter; +int32_t FlightRecorder::CsumLoggingCounter; +int32_t FlightRecorder::NormLoggingCounter; +int32_t FlightRecorder::ReductionLoggingCounter; +uint64_t FlightRecorder::ErrorCounter; +std::vector FlightRecorder::NormLogVector; +std::vector FlightRecorder::ReductionLogVector; +std::vector FlightRecorder::CsumLogVector; +std::vector FlightRecorder::XmitLogVector; +std::vector FlightRecorder::RecvLogVector; + +void FlightRecorder::ResetCounters(void) +{ + XmitLoggingCounter=0; + RecvLoggingCounter=0; + CsumLoggingCounter=0; + NormLoggingCounter=0; + ReductionLoggingCounter=0; +} +void FlightRecorder::Truncate(void) +{ + ResetCounters(); + XmitLogVector.resize(0); + RecvLogVector.resize(0); + NormLogVector.resize(0); + CsumLogVector.resize(0); + ReductionLogVector.resize(0); +} +void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) +{ + switch ( mode ) { + case LoggingModePrint: + SetLoggingModePrint(); + break; + case LoggingModeRecord: + SetLoggingModeRecord(); + break; + case LoggingModeVerify: + SetLoggingModeVerify(); + break; + case LoggingModeNone: + LoggingMode = mode; + Truncate(); + break; + default: + assert(0); + } +} + +void FlightRecorder::SetLoggingModePrint(void) +{ + std::cout << " FlightRecorder: set to print output " < " < " < "<< global < dev(1); + acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t)); + acceleratorCopySynchronise(); + MPI_Barrier(MPI_COMM_WORLD); + } +} +void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) +{ + if ( ChecksumComms ){ + uint64_t *ubuf = (uint64_t *)buf; + if(LoggingMode == LoggingModeNone) return; +#ifdef GRID_SYCL + uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); + if(LoggingMode == LoggingModePrint) { + std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor < XmitLogVector; + static std::vector RecvLogVector; + static std::vector CsumLogVector; + static std::vector NormLogVector; + static std::vector ReductionLogVector; + static int ContinueOnFail; + static int PrintEntireLog; + static int ChecksumComms; + static int ChecksumCommsSend; + static void SetLoggingModePrint(void); + static void SetLoggingModeRecord(void); + static void SetLoggingModeVerify(void); + static void SetLoggingMode(LoggingMode_t mode); + static void NormLog(double value); + static void CsumLog(uint64_t csum); + static void ReductionLog(double lcl, double glbl); + static void Truncate(void); + static void ResetCounters(void); + static uint64_t ErrorCount(void); + static void xmitLog(void *,uint64_t bytes); + static void recvLog(void *,uint64_t bytes,int rank); +}; +NAMESPACE_END(Grid); + diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index f4fb776d..3a81735d 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts) } #endif +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////// @@ -90,7 +94,12 @@ int GridThread::_threads =1; int GridThread::_hyperthreads=1; int GridThread::_cores=1; +char hostname[HOST_NAME_MAX+1]; +char *GridHostname(void) +{ + return hostname; +} const Coordinate &GridDefaultLatt(void) {return Grid_default_latt;}; const Coordinate &GridDefaultMpi(void) {return Grid_default_mpi;}; const Coordinate GridDefaultSimd(int dims,int nsimd) @@ -394,7 +403,6 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "MPI is initialised and logging filters activated "< #include #include -#endif +#include + diff --git a/configure.ac b/configure.ac index c16d90f6..8e8d67af 100644 --- a/configure.ac +++ b/configure.ac @@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in esac ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons -AC_ARG_ENABLE([accelerator-cshift], - [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])], - [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) +AC_ARG_ENABLE([accelerator-aware-mpi], + [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], + [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) -AC_ARG_ENABLE([ucx-buggy], - [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])], - [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) - -case ${ac_UCXBUGGY} in +case ${ac_ACCELERATOR_AWARE_MPI} in yes) - ac_ACC_CSHIFT=no;; - *);; -esac - -case ${ac_ACC_CSHIFT} in - yes) - AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; + AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) + AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; *);; esac diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 689747c9..678acb4b 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,16 +1,16 @@ -TOOLS=$HOME/tools + ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ - --enable-accelerator-cshift \ --disable-gparity \ --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ + --enable-accelerator-aware-mpi=no\ --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" diff --git a/systems/Aurora/sourceme-sunspot-deterministic.sh b/systems/Aurora/sourceme-sunspot-deterministic.sh new file mode 100644 index 00000000..b6bbd561 --- /dev/null +++ b/systems/Aurora/sourceme-sunspot-deterministic.sh @@ -0,0 +1,2 @@ +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu diff --git a/systems/Aurora/tests/repro128.pbs b/systems/Aurora/tests/repro128.pbs new file mode 100644 index 00000000..34e2edc5 --- /dev/null +++ b/systems/Aurora/tests/repro128.pbs @@ -0,0 +1,41 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=128 +#PBS -l walltime=02:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 16 nodes, 192 ranks +# 12 ppn, 128 nodes, 1536 ranks +CMD="mpiexec -np 1536 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap " +$CMD diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs index 28030a3d..fa37ae09 100644 --- a/systems/Aurora/tests/repro16.pbs +++ b/systems/Aurora/tests/repro16.pbs @@ -2,26 +2,39 @@ ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 -#PBS -q EarlyAppAccess -#PBS -l select=16 -#PBS -l walltime=01:00:00 +#PBS -l select=16:system=sunspot,place=scatter #PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=01:00:00 +#PBS -N dwf +#PBS -k doe #export OMP_PROC_BIND=spread #unset OMP_PLACES cd $PBS_O_WORKDIR -source ../sourceme.sh +#source ../sourceme.sh cat $PBS_NODEFILE +#export MPICH_COLL_SYNC=1 +#export MPICH_ENV_DISPLAY=1 +export MPICH_ export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu +#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 @@ -32,9 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU -# 12 ppn, 16 nodes, 192 ranks +DIR=repro.$PBS_JOBID +mkdir $DIR +cd $DIR + CMD="mpiexec -np 192 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ - --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" + ../gpu_tile_compact.sh \ + ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative" +#--comms-overlap $CMD + +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* + diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs new file mode 100644 index 00000000..283a9343 --- /dev/null +++ b/systems/Aurora/tests/repro1gpu.pbs @@ -0,0 +1,82 @@ +#!/bin/bash + +#PBS -l select=16:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N repro1gpu +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + + +for g in {0..11} +do +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) + +export numa=${NUMA_MAP[$g]} +export gpu_id=${GPU_MAP[$g]} +export tile_id=${TILE_MAP[$g]} +export gpu=$gpu_id.$tile_id + +cd $PBS_O_WORKDIR + +DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile +echo $gpu > gpu + +export ZE_AFFINITY_MASK=$gpu +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \ + numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" +echo $CMD +$CMD & + +done +done + +wait + diff --git a/systems/Aurora/tests/reproN.pbs b/systems/Aurora/tests/reproN.pbs new file mode 100644 index 00000000..293e7ade --- /dev/null +++ b/systems/Aurora/tests/reproN.pbs @@ -0,0 +1,98 @@ +#!/bin/bash + +#PBS -l select=32:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproN +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +cd $PBS_O_WORKDIR + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE + +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile + +#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ +# ../../gpu_tile_compact.sh \ +# ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ +# --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ + ../../gpu_tile_compact.sh \ + ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +echo $CMD > command-line +env > environment +$CMD & + +done + +# Suspicious wait is allowing jobs to collide and knock out +#wait + +sleep 6500 + +for n in ` eval echo {1..$NN} ` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE + +cd $DIR + +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* + +done diff --git a/systems/Aurora/tests/solver/stag16.pbs b/systems/Aurora/tests/solver/stag16.pbs index 5bfe04a6..ec38fe89 100644 --- a/systems/Aurora/tests/solver/stag16.pbs +++ b/systems/Aurora/tests/solver/stag16.pbs @@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU CMD="mpiexec -np 192 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ - --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap" $CMD diff --git a/systems/Sunspot/config-command b/systems/Sunspot/config-command index e59ef515..dbfe43c1 100644 --- a/systems/Sunspot/config-command +++ b/systems/Sunspot/config-command @@ -1,4 +1,4 @@ -TOOLS=$HOME/tools + ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ @@ -11,6 +11,6 @@ TOOLS=$HOME/tools --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" diff --git a/systems/Sunspot/sourceme.sh b/systems/Sunspot/sourceme.sh new file mode 100644 index 00000000..b6bbd561 --- /dev/null +++ b/systems/Sunspot/sourceme.sh @@ -0,0 +1,2 @@ +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu diff --git a/systems/Sunspot/tests/repro1gpu.pbs b/systems/Sunspot/tests/repro1gpu.pbs new file mode 100644 index 00000000..3b95b404 --- /dev/null +++ b/systems/Sunspot/tests/repro1gpu.pbs @@ -0,0 +1,81 @@ +#!/bin/bash + +#PBS -l select=16:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N repro1gpu +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + + +for g in {0..11} +do +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) + +export numa=${NUMA_MAP[$g]} +export gpu_id=${GPU_MAP[$g]} +export tile_id=${TILE_MAP[$g]} +export gpu=$gpu_id.$tile_id + +cd $PBS_O_WORKDIR + +DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile +echo $gpu > gpu + +export ZE_AFFINITY_MASK=$gpu +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \ + numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" +echo $CMD +$CMD & + +done +done + +wait + diff --git a/systems/Sunspot/tests/reproN.pbs b/systems/Sunspot/tests/reproN.pbs new file mode 100644 index 00000000..fde4f3a9 --- /dev/null +++ b/systems/Sunspot/tests/reproN.pbs @@ -0,0 +1,97 @@ +#!/bin/bash + +#PBS -l select=32:system=sunspot,place=scatter +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproN +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +module load oneapi/eng-compiler/2023.05.15.003 +module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=1 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +NN=`cat $PBS_NODEFILE | wc -l` +echo $PBS_NODEFILE +cat $PBS_NODEFILE + +echo $NN nodes in node file +for n in `eval echo {1..$NN}` +do + +cd $PBS_O_WORKDIR + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +echo Node $n is $THIS_NODE + +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE + +mkdir -p $DIR +cd $DIR + +echo $THIS_NODE > nodefile + +#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ +# ../../gpu_tile_compact.sh \ +# ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ +# --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \ + ../../gpu_tile_compact.sh \ + ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +echo $CMD > command-line +env > environment +$CMD & + +done + +# Suspicious wait is allowing jobs to collide and knock out +#wait + +sleep 6500 + +for n in ` eval echo {1..$NN} ` +do + +THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` +DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE + +cd $DIR + +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* + +done diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index 13cc0bb6..3d21aff4 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -30,6 +30,50 @@ Author: Peter Boyle using namespace std; using namespace Grid; +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + + +NAMESPACE_BEGIN(Grid); +template + class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase { + public: + Matrix &_Mat; + SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){}; + virtual void Mpc (const Field &in, Field &out) { + Field tmp(in.Grid()); + tmp.Checkerboard() = !in.Checkerboard(); + // std::cout <<" Mpc starting"< HermOpEO(Ddwf); - SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); + SchurDiagMooeeOperatorParanoid HermOpEO(Ddwf); + SchurDiagMooeeOperatorParanoid HermOpEO_f(Ddwf_f); int nsecs=600; if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ @@ -100,10 +144,22 @@ int main (int argc, char ** argv) time_t start = time(NULL); - uint32_t csum, csumref; - csumref=0; + FlightRecorder::ContinueOnFail = 0; + FlightRecorder::PrintEntireLog = 0; + FlightRecorder::ChecksumComms = 1; + FlightRecorder::ChecksumCommsSend=0; + + if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s); + int iter=0; do { + if ( iter == 0 ) { + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); + } else { + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); + } std::cerr << "******************* SINGLE PRECISION SOLVE "<gSites()*iters; std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - csumref=0; int i=0; do { + if ( i == 0 ) { + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord); + } else { + FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify); + } std::cerr << "******************* DOUBLE PRECISION SOLVE "<