mirror of
https://github.com/paboyle/Grid.git
synced 2025-12-04 13:24:40 +00:00
Compare commits
5 Commits
develop
...
feature/ve
| Author | SHA1 | Date | |
|---|---|---|---|
| | 7132a4fd28 | | |
| | e8057d6b4a | | |
| | 973584e039 | | |
| | ea46c2dc3c | | |
| | 50bcd76fc1 | | |
@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
||||
srq.PacketType = InterNodeRecv;
|
||||
srq.bytes = rbytes;
|
||||
srq.req = rrq;
|
||||
srq.dest = from;
|
||||
srq.host_buf = host_recv;
|
||||
srq.device_buf = recv;
|
||||
srq.tag = tag;
|
||||
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.dest = from;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
@@ -817,12 +818,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
|
||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||
GRID_ASSERT(ierr==0);
|
||||
}
|
||||
|
||||
|
||||
// for(int r=0;r<nreq;r++){
|
||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
// }
|
||||
// }
|
||||
// Get global error count in comms receive
|
||||
#define AUDIT_FLIGHT_RECORDER_ERRORS
|
||||
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
|
||||
uint64_t EC = FlightRecorder::CommsErrorCount();
|
||||
if (EC) std::cerr << " global sum error count "<<EC<<std::endl;
|
||||
this->GlobalSum(EC);
|
||||
if (EC) {
|
||||
for(int r=0;r<list.size();r++){
|
||||
#ifdef GRID_CHECKSUM_COMMS
|
||||
uint64_t rbytes_data = list[r].bytes - 8;
|
||||
#else
|
||||
uint64_t rbytes_data = list[r].bytes;
|
||||
#endif
|
||||
if (list[r].PacketType == InterNodeReceiveHtoD) {
|
||||
std::cerr << " calling xor reduce "<<std::endl;
|
||||
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
|
||||
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
|
||||
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
|
||||
} if (list[r].PacketType == InterNodeXmitISend ) {
|
||||
std::cerr << " calling xor reduce "<<std::endl;
|
||||
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
|
||||
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
|
||||
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GRID_CHECKSUM_COMMS
|
||||
for(int r=0;r<list.size();r++){
|
||||
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
|
||||
|
||||
@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
|
||||
{
|
||||
Word identity; identity=0;
|
||||
Word ret = 0;
|
||||
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
theGridAccelerator->wait();
|
||||
return ret;
|
||||
}
|
||||
// XOR-fold the first L words of vec with a plain sequential loop.
//   vec : pointer to the first word to fold
//   L   : number of words to fold (L == 0 yields a zero-valued Word)
// Returns the running XOR of vec[0] .. vec[L-1].
template<class Word> Word cpu_xor(Word *vec,uint64_t L)
{
  Word folded = 0;
  const Word *const last = vec + L;
  for (const Word *cursor = vec; cursor != last; ++cursor) {
    folded = folded ^ *cursor;
  }
  return folded;
}
|
||||
// Thin wrapper kept under the svm_xor name: forwards to gpu_xor.
//   vec : pointer to the first word to fold
//   L   : number of words to fold
// Returns whatever gpu_xor(vec, L) returns.
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
  // The result must be returned explicitly: flowing off the end of a
  // non-void function is undefined behaviour and hands the caller garbage.
  return gpu_xor(vec,L);
}
|
||||
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
|
||||
{
|
||||
Word identity; identity=0;
|
||||
|
||||
@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
|
||||
int32_t FlightRecorder::NormLoggingCounter;
|
||||
int32_t FlightRecorder::ReductionLoggingCounter;
|
||||
uint64_t FlightRecorder::ErrorCounter;
|
||||
uint64_t FlightRecorder::CommsErrorCounter;
|
||||
|
||||
std::vector<double> FlightRecorder::NormLogVector;
|
||||
std::vector<double> FlightRecorder::ReductionLogVector;
|
||||
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
|
||||
ResetCounters();
|
||||
LoggingMode = LoggingModeVerify;
|
||||
}
|
||||
uint64_t FlightRecorder::CommsErrorCount(void)
|
||||
{
|
||||
return CommsErrorCounter;
|
||||
}
|
||||
uint64_t FlightRecorder::ErrorCount(void)
|
||||
{
|
||||
return ErrorCounter;
|
||||
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
||||
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
||||
|
||||
ErrorCounter++;
|
||||
|
||||
} else {
|
||||
if ( PrintEntireLog ) {
|
||||
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
|
||||
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
||||
|
||||
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
||||
|
||||
CommsErrorCounter++;
|
||||
ErrorCounter++;
|
||||
|
||||
} else {
|
||||
if ( PrintEntireLog ) {
|
||||
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
|
||||
|
||||
@@ -9,8 +9,8 @@ class FlightRecorder {
|
||||
LoggingModeRecord,
|
||||
LoggingModeVerify
|
||||
};
|
||||
|
||||
static int LoggingMode;
|
||||
static uint64_t CommsErrorCounter;
|
||||
static uint64_t ErrorCounter;
|
||||
static const char * StepName;
|
||||
static int32_t StepLoggingCounter;
|
||||
@@ -39,6 +39,7 @@ class FlightRecorder {
|
||||
static void Truncate(void);
|
||||
static void ResetCounters(void);
|
||||
static uint64_t ErrorCount(void);
|
||||
static uint64_t CommsErrorCount(void);
|
||||
static void xmitLog(void *,uint64_t bytes);
|
||||
static void recvLog(void *,uint64_t bytes,int rank);
|
||||
};
|
||||
|
||||
@@ -51,7 +51,7 @@ EOF
|
||||
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
|
||||
../gpu_tile.sh \
|
||||
$BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \
|
||||
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message "
|
||||
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message --debug-stdout "
|
||||
|
||||
echo $CMD > command-line
|
||||
env > environment
|
||||
|
||||
62
systems/Aurora/tests/reproBigJob256.pbs
Normal file
62
systems/Aurora/tests/reproBigJob256.pbs
Normal file
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash

# PBS batch script: launches Test_dwf_mixedcg_prec on 256 nodes
# (3072 MPI ranks, 12 per node) and collects any "Oops" lines from the
# per-rank stderr logs once the run finishes.

#PBS -l select=256
#PBS -q run_next
##PBS -A LatticeQCD_aesp_CNDA
#PBS -A LatticeFlavor
#PBS -l walltime=06:00:00
#PBS -N reproBigJob
#PBS -k doe
# Request both filesystems in a single colon-separated resource list rather
# than two separate "-l filesystems=" directives.
#PBS -l filesystems=flare:home

#export OMP_PROC_BIND=spread
#unset OMP_PLACES


# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6

export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=1

export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
export MPIR_CVAR_NOLOCAL=1

cd "$PBS_O_WORKDIR"

source ../sourceme.sh

cp "$PBS_NODEFILE" nodefile

# Per-job working directory so repeated submissions do not overwrite logs.
DIR=reproBigJob.$PBS_JOBID

mkdir -p "$DIR"
cd "$DIR"

cp "$PBS_NODEFILE" nodefile

BINARY=../Test_dwf_mixedcg_prec

# Write the helper script "pingjob". `cat` is required here: `echo` does not
# read stdin, so the here-document would be discarded. The delimiter is quoted
# so that $node is kept literally and expanded when pingjob itself is run,
# not while this file is being generated.
cat > pingjob <<'EOF'
while read node ;
do
echo ssh $node killall -HUP Test_dwf_mixedcg_prec
done < nodefile
EOF

CMD="mpiexec -np 3072 -ppn 12 -envall --hostfile nodefile \
	     ../gpu_tile.sh \
	     $BINARY --mpi 8.8.8.6 --grid 128.128.128.288 \
	--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --log Message --debug-stdout "

# Record exactly what was run and in which environment, for later triage.
echo $CMD > command-line
env > environment
# Intentionally unquoted: CMD must be word-split into the mpiexec argv.
$CMD
grep Oops */Grid.stderr.* > failures.$PBS_JOBID
|
||||
|
||||
12
systems/mac-arm/config-command
Normal file
12
systems/mac-arm/config-command
Normal file
@@ -0,0 +1,12 @@
|
||||
# Configure line for a debug MPI build on an ARM Mac.
# NOTE(review): $BREW is not set in this file; presumably it is exported by the
# caller's environment and points at the OpenSSL install prefix - confirm.
CXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ ../../configure \
    --enable-simd=GEN \
    --enable-comms=mpi3 \
    --enable-debug \
    --enable-unified=yes \
    --prefix /Users/peterboyle/QCD/BugHunt/install \
    --with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
    --with-openssl=$BREW \
    --disable-fermion-reps \
    --disable-gparity
|
||||
|
||||
@@ -82,6 +82,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::cout << GridLogMessage<<" in main(): Grid is initialised"<<std::endl;
|
||||
const int Ls=12;
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||
@@ -94,6 +95,7 @@ int main (int argc, char ** argv)
|
||||
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
|
||||
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
|
||||
|
||||
std::cout << GridLogMessage<<" in main(): making RNGs"<<std::endl;
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
@@ -152,7 +154,7 @@ int main (int argc, char ** argv)
|
||||
time_t start = time(NULL);
|
||||
UGrid->Broadcast(0,(void *)&start,sizeof(start));
|
||||
|
||||
FlightRecorder::ContinueOnFail = 0;
|
||||
FlightRecorder::ContinueOnFail = 1;
|
||||
FlightRecorder::PrintEntireLog = 0;
|
||||
FlightRecorder::ChecksumComms = 0;
|
||||
FlightRecorder::ChecksumCommsSend=0;
|
||||
|
||||
Reference in New Issue
Block a user