1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-12-04 05:14:40 +00:00

Compare commits

..

11 Commits

Author SHA1 Message Date
7132a4fd28 Update 2025-12-02 23:22:42 +00:00
e8057d6b4a Updated for verbose on host vs. device side csum 2025-12-02 23:15:32 +00:00
973584e039 Update on aurora 2025-12-02 22:24:54 +00:00
Peter Boyle
ea46c2dc3c config command 2025-12-02 16:01:37 -05:00
Peter Boyle
50bcd76fc1 Changes to help with error logging on aurora -- triage MPI / Slingshot vs. host-device / SYCL on checksum error 2025-12-02 15:51:29 -05:00
4a0aaf0786 Fix issue with Aurora compilers 2025-11-21 21:41:13 +00:00
9c3835524c Fix compile warn 2025-11-21 21:41:12 +00:00
549351bb8a Stag verbose clean up 2025-11-20 18:22:57 +00:00
Peter Boyle
74e6b19f83 Looks like the reuse of xfers in staggered has bugs or corner cases depending on volume 2025-11-17 22:29:06 -05:00
Peter Boyle
2e684028de Improvements 2025-11-14 18:12:27 -05:00
c54d87a472 Aurora compile fix for new compiler 2025-11-06 18:17:33 +00:00
19 changed files with 170 additions and 35 deletions

View File

@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.dest = from;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.dest = from;
srq.commdir = dir;
list.push_back(srq);
}
@@ -817,12 +818,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
GRID_ASSERT(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
// Get global error count in comms receive
#define AUDIT_FLIGHT_RECORDER_ERRORS
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
uint64_t EC = FlightRecorder::CommsErrorCount();
if (EC) std::cerr << " global sum error count "<<EC<<std::endl;
this->GlobalSum(EC);
if (EC) {
for(int r=0;r<list.size();r++){
#ifdef GRID_CHECKSUM_COMMS
uint64_t rbytes_data = list[r].bytes - 8;
#else
uint64_t rbytes_data = list[r].bytes;
#endif
if (list[r].PacketType == InterNodeReceiveHtoD) {
std::cerr << " calling xor reduce "<<std::endl;
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
} if (list[r].PacketType == InterNodeXmitISend ) {
std::cerr << " calling xor reduce "<<std::endl;
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
}
}
}
#endif
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {

View File

@@ -27,6 +27,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
void GridAbort(void) { abort(); }
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -34,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world;
void GridAbort(void) { abort(); }
void CartesianCommunicator::Init(int *argc, char *** arv)
{

View File

@@ -124,7 +124,7 @@ template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &
void *hsend_buf = (void *)&hrhs[0];
void *hrecv_buf = (void *)&hret[0];
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
acceleratorCopyFromDevice(send_buf,hsend_buf,bytes);
grid->SendToRecvFrom(hsend_buf,
xmit_to_rank,
@@ -132,7 +132,7 @@ template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes);
#endif
}
}

View File

@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
template<class Word> Word cpu_xor(Word *vec,uint64_t L)
{
Word csum=0;
for(uint64_t w=0;w<L;w++){
csum = csum ^ vec[w];
}
return csum;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
gpu_xor(vec,L);
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;

View File

@@ -411,7 +411,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody
}
#ifdef GRID_SYCL
#if 0
extern "C" {
ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );

View File

@@ -138,10 +138,13 @@ public:
//auto start = std::chrono::high_resolution_clock::now();
autoView(U_v,U,AcceleratorWrite);
autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{
typedef typename Field::vector_object vobj;
const int Nsimd = vobj::Nsimd();
accelerator_for(ss, P.Grid()->oSites(),Nsimd,{
for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu);
U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu));
auto tmp = Exponentiate(P_v(ss)(mu), ep, Nexp) * U_v(ss)(mu);
tmp = Group::ProjectOnGeneralGroup(tmp);
coalescedWrite(U_v[ss](mu),tmp);
}
});
//auto end = std::chrono::high_resolution_clock::now();

View File

@@ -291,8 +291,8 @@ public:
int idx=0;
for(int mu=0;mu<4;mu++){
for(int nu=0;nu<4;nu++){
if ( mu!=nu) GRID_ASSERT(this->StoutSmearing->SmearRho[idx]==rho);
else GRID_ASSERT(this->StoutSmearing->SmearRho[idx]==0.0);
if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
else assert(this->StoutSmearing->SmearRho[idx]==0.0);
idx++;
}}
//////////////////////////////////////////////////////////////////
@@ -825,6 +825,7 @@ public:
virtual void fill_smearedSet(GaugeField &U)
{
this->ThinLinks = &U; // attach the smearing routine to the field U
std::cout << GridLogMessage << " fill_smearedSet " << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
// check the pointer is not null
if (this->ThinLinks == NULL)
@@ -846,6 +847,8 @@ public:
ApplyMask(smeared_A,smearLvl);
smeared_B = previous_u;
ApplyMask(smeared_B,smearLvl);
std::cout << GridLogMessage << " smeared_A " << norm2(smeared_A) << std::endl;
std::cout << GridLogMessage << " smeared_B " << norm2(smeared_B) << std::endl;
// Replace only the masked portion
this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
previous_u = this->SmearedSet[smearLvl];
@@ -934,10 +937,10 @@ public:
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
{
GRID_ASSERT(Nsmear%(2*Nd)==0); // Or multiply by 8??
assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
// was resized in base class
GRID_ASSERT(this->SmearedSet.size()==Nsmear);
assert(this->SmearedSet.size()==Nsmear);
GridRedBlackCartesian * UrbGrid;
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);

View File

@@ -54,7 +54,7 @@ public:
// Usual cases are not used
//////////////////////////////////
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ GRID_ASSERT(0);};
virtual RealD S(const GaugeField &U) { GRID_ASSERT(0); }
virtual RealD S(const GaugeField &U) { GRID_ASSERT(0); return 0; }
virtual void deriv(const GaugeField &U, GaugeField &dSdU) { GRID_ASSERT(0); }
//////////////////////////////////

View File

@@ -751,7 +751,7 @@ public:
obj.xbytes = xbytes;
obj.rbytes = rbytes;
obj.cb = cb;
for(int i=0;i<CachedTransfers.size();i++){
if ( (CachedTransfers[i].direction ==direction)
&&(CachedTransfers[i].OrthogPlane==OrthogPlane)
@@ -763,11 +763,13 @@ public:
){
// FIXME worry about duplicate with partial compression
// Wont happen as DWF has no duplicates, but...
AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
return 1;
// AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
// std::cout << "Duplicate dir " <<direction<<" "<<" OrthogPlane "<<OrthogPlane<<" Dest"<<DestProc <<" xbytes " <<xbytes<<" lane "<< lane<<" cb "<<cb<<std::endl;
return 0;
// return 1;
}
}
CachedTransfers.push_back(obj);
return 0;
}

View File

@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter;
uint64_t FlightRecorder::CommsErrorCounter;
std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector;
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
ResetCounters();
LoggingMode = LoggingModeVerify;
}
uint64_t FlightRecorder::CommsErrorCount(void)
{
return CommsErrorCounter;
}
uint64_t FlightRecorder::ErrorCount(void)
{
return ErrorCounter;
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if ( !ContinueOnFail ) GRID_ASSERT(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if ( !ContinueOnFail ) GRID_ASSERT(0);
CommsErrorCounter++;
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;

View File

@@ -9,8 +9,8 @@ class FlightRecorder {
LoggingModeRecord,
LoggingModeVerify
};
static int LoggingMode;
static uint64_t CommsErrorCounter;
static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;
@@ -39,6 +39,7 @@ class FlightRecorder {
static void Truncate(void);
static void ResetCounters(void);
static uint64_t ErrorCount(void);
static uint64_t CommsErrorCount(void);
static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank);
};

View File

@@ -755,7 +755,7 @@ void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
sig_print_uint(si->si_code);
SIGLOG("\n");
unw_context_t *uc= (unw_context_t *)ptr;
ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND

View File

@@ -409,16 +409,10 @@ AC_SEARCH_LIBS([unw_backtrace], [unwind],
[have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AS_CASE([$host_cpu], [x86_64],
[AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])],
[aarch64],
[AC_SEARCH_LIBS([_Uaarch64_step], [unwind-aarch64],
[AC_DEFINE([HAVE_UNWIND_AARCH64], [1], [Define to 1 if you have the `libunwind-aarch64' library])]
[have_unwind_aarch64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])])
AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]

View File

@@ -10,7 +10,7 @@ export HDF5=/opt/cray/pe/hdf5/1.12.2.3/gnu/9.1
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-checksum-comms=yes \
--enable-checksum-comms=no \
--enable-log-views=yes \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=no \

View File

@@ -51,7 +51,7 @@ EOF
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
../gpu_tile.sh \
$BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message "
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message --debug-stdout "
echo $CMD > command-line
env > environment

View File

@@ -0,0 +1,62 @@
#!/bin/bash
#PBS -l select=256
#PBS -q run_next
##PBS -A LatticeQCD_aesp_CNDA
#PBS -A LatticeFlavor
#PBS -l walltime=06:00:00
#PBS -N reproBigJob
#PBS -k doe
#PBS -l filesystems=flare
#PBS -l filesystems=home
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=1
export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
export MPIR_CVAR_NOLOCAL=1
cd $PBS_O_WORKDIR
source ../sourceme.sh
cp $PBS_NODEFILE nodefile
DIR=reproBigJob.$PBS_JOBID
mkdir -p $DIR
cd $DIR
cp $PBS_NODEFILE nodefile
BINARY=../Test_dwf_mixedcg_prec
echo > pingjob <<EOF
while read node ;
do
echo ssh $node killall -HUP Test_dwf_mixedcg_prec
done < nodefile
EOF
CMD="mpiexec -np 3072 -ppn 12 -envall --hostfile nodefile \
../gpu_tile.sh \
$BINARY --mpi 8.8.8.6 --grid 128.128.128.288 \
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --log Message --debug-stdout "
echo $CMD > command-line
env > environment
$CMD
grep Oops */Grid.stderr.* > failures.$PBS_JOBID

View File

@@ -0,0 +1,12 @@
CXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ ../../configure \
--enable-simd=GEN \
--enable-comms=mpi3 \
--enable-debug \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/BugHunt/install \
--with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
--with-openssl=$BREW \
--disable-fermion-reps \
--disable-gparity \
--enable-debug

View File

@@ -82,6 +82,7 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
std::cout << GridLogMessage<<" in main(): Grid is initialised"<<std::endl;
const int Ls=12;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
@@ -94,6 +95,7 @@ int main (int argc, char ** argv)
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
std::cout << GridLogMessage<<" in main(): making RNGs"<<std::endl;
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
@@ -152,7 +154,7 @@ int main (int argc, char ** argv)
time_t start = time(NULL);
UGrid->Broadcast(0,(void *)&start,sizeof(start));
FlightRecorder::ContinueOnFail = 0;
FlightRecorder::ContinueOnFail = 1;
FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 0;
FlightRecorder::ChecksumCommsSend=0;

View File

@@ -62,6 +62,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
Gimpl::generate_momenta(P,sRNG,RNG4);
// Filter.applyFilter(P);
std::cout << GridLogMessage << "Initial momenta " << norm2(P) << std::endl;
action.refresh(smU,sRNG,RNG4);
@@ -70,6 +71,8 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S1 = action.S(smU);
std::cout << GridLogMessage << "Initial action " << S1 << std::endl;
Gimpl::update_field(P,U,eps);
smU.set_Field(U);
@@ -80,6 +83,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
action.deriv(smU,UdSdU);
UdSdU = Ta(UdSdU);
// Filter.applyFilter(UdSdU);
std::cout << GridLogMessage << "Derivative " << norm2(UdSdU) << std::endl;
DumpSliceNorm("Force",UdSdU,Nd-1);
@@ -91,6 +95,7 @@ void ForceTest(Action<LatticeGaugeField> &action,ConfigurationBase<LatticeGaugeF
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S2 = action.S(smU);
std::cout << GridLogMessage << "Final action " << S1 << std::endl;
// Use the derivative
LatticeComplex dS(UGrid); dS = Zero();
@@ -145,6 +150,8 @@ int main (int argc, char ** argv)
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
SU<Nc>::HotConfiguration(RNG4,U);
#endif
std::cout << GridLogMessage << "Initial plaquette: " << WilsonLoops<PeriodicGimplR>::avgPlaquette(U) << std::endl;
WilsonGaugeActionR PlaqAction(6.0);