mirror of
https://github.com/paboyle/Grid.git
synced 2026-03-04 19:46:13 +00:00
Compare commits
5 Commits
develop
...
feature/ve
| Author | SHA1 | Date | |
|---|---|---|---|
| 7132a4fd28 | |||
| e8057d6b4a | |||
| 973584e039 | |||
|
|
ea46c2dc3c | ||
|
|
50bcd76fc1 |
@@ -589,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
|||||||
srq.PacketType = InterNodeRecv;
|
srq.PacketType = InterNodeRecv;
|
||||||
srq.bytes = rbytes;
|
srq.bytes = rbytes;
|
||||||
srq.req = rrq;
|
srq.req = rrq;
|
||||||
|
srq.dest = from;
|
||||||
srq.host_buf = host_recv;
|
srq.host_buf = host_recv;
|
||||||
srq.device_buf = recv;
|
srq.device_buf = recv;
|
||||||
srq.tag = tag;
|
srq.tag = tag;
|
||||||
@@ -765,7 +766,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
srq.host_buf = NULL;
|
srq.host_buf = NULL;
|
||||||
srq.device_buf = xmit;
|
srq.device_buf = xmit;
|
||||||
srq.tag = -1;
|
srq.tag = -1;
|
||||||
srq.dest = dest;
|
srq.dest = from;
|
||||||
srq.commdir = dir;
|
srq.commdir = dir;
|
||||||
list.push_back(srq);
|
list.push_back(srq);
|
||||||
}
|
}
|
||||||
@@ -817,12 +818,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
|
|||||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||||
GRID_ASSERT(ierr==0);
|
GRID_ASSERT(ierr==0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// for(int r=0;r<nreq;r++){
|
// for(int r=0;r<nreq;r++){
|
||||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
// Get global error count in comms receive
|
||||||
|
#define AUDIT_FLIGHT_RECORDER_ERRORS
|
||||||
|
#ifdef AUDIT_FLIGHT_RECORDER_ERRORS
|
||||||
|
uint64_t EC = FlightRecorder::CommsErrorCount();
|
||||||
|
if (EC) std::cerr << " global sum error count "<<EC<<std::endl;
|
||||||
|
this->GlobalSum(EC);
|
||||||
|
if (EC) {
|
||||||
|
for(int r=0;r<list.size();r++){
|
||||||
|
#ifdef GRID_CHECKSUM_COMMS
|
||||||
|
uint64_t rbytes_data = list[r].bytes - 8;
|
||||||
|
#else
|
||||||
|
uint64_t rbytes_data = list[r].bytes;
|
||||||
|
#endif
|
||||||
|
if (list[r].PacketType == InterNodeReceiveHtoD) {
|
||||||
|
std::cerr << " calling xor reduce "<<std::endl;
|
||||||
|
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
|
||||||
|
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
|
||||||
|
std::cerr << " Packet "<<r<<" Receive from " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
|
||||||
|
} if (list[r].PacketType == InterNodeXmitISend ) {
|
||||||
|
std::cerr << " calling xor reduce "<<std::endl;
|
||||||
|
uint64_t csg = gpu_xor((uint64_t*)list[r].device_buf,rbytes_data/8);
|
||||||
|
uint64_t csh = cpu_xor((uint64_t*)list[r].host_buf,rbytes_data/8);
|
||||||
|
std::cerr << " Packet "<<r<<" Send to " <<list[r].dest<<" host csum "<<csh<<" gpu csum "<<csg<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GRID_CHECKSUM_COMMS
|
#ifdef GRID_CHECKSUM_COMMS
|
||||||
for(int r=0;r<list.size();r++){
|
for(int r=0;r<list.size();r++){
|
||||||
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
|
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
|
||||||
|
|||||||
@@ -68,8 +68,7 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class Word> Word gpu_xor(Word *vec,uint64_t L)
|
||||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
|
||||||
{
|
{
|
||||||
Word identity; identity=0;
|
Word identity; identity=0;
|
||||||
Word ret = 0;
|
Word ret = 0;
|
||||||
@@ -87,6 +86,18 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
|||||||
theGridAccelerator->wait();
|
theGridAccelerator->wait();
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
// Host-side XOR reduction.
// Folds the L words starting at vec together with operator^ and returns the
// result; an empty range (L == 0) yields 0.
template<class Word> Word cpu_xor(Word *vec,uint64_t L)
{
  Word folded = 0;
  Word *cursor = vec;
  Word *const stop = vec + L;
  while ( cursor != stop ) {
    folded = folded ^ *cursor;
    ++cursor;
  }
  return folded;
}
|
||||||
|
// Thin forwarding wrapper: XOR-reduces the L words at vec by delegating to
// gpu_xor and hands back its result.
// The previous body discarded the value and fell off the end of a non-void
// function, which is undefined behaviour; the result is now returned.
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
  return gpu_xor(vec,L);
}
|
||||||
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
|
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
|
||||||
{
|
{
|
||||||
Word identity; identity=0;
|
Word identity; identity=0;
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ int32_t FlightRecorder::CsumLoggingCounter;
|
|||||||
int32_t FlightRecorder::NormLoggingCounter;
|
int32_t FlightRecorder::NormLoggingCounter;
|
||||||
int32_t FlightRecorder::ReductionLoggingCounter;
|
int32_t FlightRecorder::ReductionLoggingCounter;
|
||||||
uint64_t FlightRecorder::ErrorCounter;
|
uint64_t FlightRecorder::ErrorCounter;
|
||||||
|
uint64_t FlightRecorder::CommsErrorCounter;
|
||||||
|
|
||||||
std::vector<double> FlightRecorder::NormLogVector;
|
std::vector<double> FlightRecorder::NormLogVector;
|
||||||
std::vector<double> FlightRecorder::ReductionLogVector;
|
std::vector<double> FlightRecorder::ReductionLogVector;
|
||||||
@@ -118,6 +119,10 @@ void FlightRecorder::SetLoggingModeVerify(void)
|
|||||||
ResetCounters();
|
ResetCounters();
|
||||||
LoggingMode = LoggingModeVerify;
|
LoggingMode = LoggingModeVerify;
|
||||||
}
|
}
|
||||||
|
uint64_t FlightRecorder::CommsErrorCount(void)
|
||||||
|
{
|
||||||
|
return CommsErrorCounter;
|
||||||
|
}
|
||||||
uint64_t FlightRecorder::ErrorCount(void)
|
uint64_t FlightRecorder::ErrorCount(void)
|
||||||
{
|
{
|
||||||
return ErrorCounter;
|
return ErrorCounter;
|
||||||
@@ -312,6 +317,7 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
|||||||
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
||||||
|
|
||||||
ErrorCounter++;
|
ErrorCounter++;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if ( PrintEntireLog ) {
|
if ( PrintEntireLog ) {
|
||||||
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
|
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
|
||||||
@@ -357,7 +363,9 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
|||||||
|
|
||||||
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
if ( !ContinueOnFail ) GRID_ASSERT(0);
|
||||||
|
|
||||||
|
CommsErrorCounter++;
|
||||||
ErrorCounter++;
|
ErrorCounter++;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if ( PrintEntireLog ) {
|
if ( PrintEntireLog ) {
|
||||||
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
|
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
|
||||||
|
|||||||
@@ -9,8 +9,8 @@ class FlightRecorder {
|
|||||||
LoggingModeRecord,
|
LoggingModeRecord,
|
||||||
LoggingModeVerify
|
LoggingModeVerify
|
||||||
};
|
};
|
||||||
|
|
||||||
static int LoggingMode;
|
static int LoggingMode;
|
||||||
|
static uint64_t CommsErrorCounter;
|
||||||
static uint64_t ErrorCounter;
|
static uint64_t ErrorCounter;
|
||||||
static const char * StepName;
|
static const char * StepName;
|
||||||
static int32_t StepLoggingCounter;
|
static int32_t StepLoggingCounter;
|
||||||
@@ -39,6 +39,7 @@ class FlightRecorder {
|
|||||||
static void Truncate(void);
|
static void Truncate(void);
|
||||||
static void ResetCounters(void);
|
static void ResetCounters(void);
|
||||||
static uint64_t ErrorCount(void);
|
static uint64_t ErrorCount(void);
|
||||||
|
static uint64_t CommsErrorCount(void);
|
||||||
static void xmitLog(void *,uint64_t bytes);
|
static void xmitLog(void *,uint64_t bytes);
|
||||||
static void recvLog(void *,uint64_t bytes,int rank);
|
static void recvLog(void *,uint64_t bytes,int rank);
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ EOF
|
|||||||
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
|
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
|
||||||
../gpu_tile.sh \
|
../gpu_tile.sh \
|
||||||
$BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \
|
$BINARY --mpi 4.4.4.6 --grid 64.64.64.96 \
|
||||||
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message "
|
--shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --log Message --debug-stdout "
|
||||||
|
|
||||||
echo $CMD > command-line
|
echo $CMD > command-line
|
||||||
env > environment
|
env > environment
|
||||||
|
|||||||
62
systems/Aurora/tests/reproBigJob256.pbs
Normal file
62
systems/Aurora/tests/reproBigJob256.pbs
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
#!/bin/bash

#PBS -l select=256
#PBS -q run_next
##PBS -A LatticeQCD_aesp_CNDA
#PBS -A LatticeFlavor
#PBS -l walltime=06:00:00
#PBS -N reproBigJob
#PBS -k doe
#PBS -l filesystems=flare
#PBS -l filesystems=home

# Batch job: runs Test_dwf_mixedcg_prec on 3072 MPI ranks (12 per node) inside
# a per-job working directory, then collects any "Oops" lines from the
# Grid.stderr.* files into failures.$PBS_JOBID.

#export OMP_PROC_BIND=spread
#unset OMP_PLACES


# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6

export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=1

export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
export MPIR_CVAR_NOLOCAL=1

cd "$PBS_O_WORKDIR"

source ../sourceme.sh

cp "$PBS_NODEFILE" nodefile

DIR=reproBigJob.$PBS_JOBID

mkdir -p "$DIR"
cd "$DIR"

# Second copy: the node list is also kept inside the per-job directory,
# where pingjob looks for it.
cp "$PBS_NODEFILE" nodefile

BINARY=../Test_dwf_mixedcg_prec

# Write the helper script "pingjob".
# cat (not echo) is required so the here-document body is actually written,
# and the quoted delimiter keeps $node literal so it is expanded when pingjob
# runs rather than when this file is generated.
# NOTE(review): pingjob only echoes the ssh/killall commands; presumably the
# output is meant to be piped to a shell -- confirm.
cat > pingjob <<'EOF'
while read node ;
do
    echo ssh $node killall -HUP Test_dwf_mixedcg_prec
done < nodefile
EOF

CMD="mpiexec -np 3072 -ppn 12 -envall --hostfile nodefile \
	     ../gpu_tile.sh \
	     $BINARY --mpi 8.8.8.6 --grid 128.128.128.288 \
	     --shm-mpi 0 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --log Message --debug-stdout "

# Record exactly what was run and under which environment.
echo $CMD > command-line
env > environment
# Intentionally unquoted: CMD must be word-split into the command and its arguments.
$CMD
grep Oops */Grid.stderr.* > failures.$PBS_JOBID
|
||||||
|
|
||||||
12
systems/mac-arm/config-command
Normal file
12
systems/mac-arm/config-command
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Configure invocation for a debug, MPI-enabled build with generic SIMD.
# --enable-debug was previously listed twice; it is now passed once.
CXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ ../../configure \
    --enable-simd=GEN \
    --enable-comms=mpi3 \
    --enable-debug \
    --enable-unified=yes \
    --prefix /Users/peterboyle/QCD/BugHunt/install \
    --with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
    --with-openssl=$BREW \
    --disable-fermion-reps \
    --disable-gparity
|
||||||
|
|
||||||
@@ -82,6 +82,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<" in main(): Grid is initialised"<<std::endl;
|
||||||
const int Ls=12;
|
const int Ls=12;
|
||||||
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||||
@@ -94,6 +95,7 @@ int main (int argc, char ** argv)
|
|||||||
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
|
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
|
||||||
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
|
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<" in main(): making RNGs"<<std::endl;
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
@@ -152,7 +154,7 @@ int main (int argc, char ** argv)
|
|||||||
time_t start = time(NULL);
|
time_t start = time(NULL);
|
||||||
UGrid->Broadcast(0,(void *)&start,sizeof(start));
|
UGrid->Broadcast(0,(void *)&start,sizeof(start));
|
||||||
|
|
||||||
FlightRecorder::ContinueOnFail = 0;
|
FlightRecorder::ContinueOnFail = 1;
|
||||||
FlightRecorder::PrintEntireLog = 0;
|
FlightRecorder::PrintEntireLog = 0;
|
||||||
FlightRecorder::ChecksumComms = 0;
|
FlightRecorder::ChecksumComms = 0;
|
||||||
FlightRecorder::ChecksumCommsSend=0;
|
FlightRecorder::ChecksumCommsSend=0;
|
||||||
|
|||||||
Reference in New Issue
Block a user