1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-19 16:55:37 +01:00

Merge branch 'paboyle:develop' into sycl_slicesum_update

This commit is contained in:
dbollweg 2024-03-13 18:06:17 -04:00 committed by GitHub
commit fee65d7a75
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 159 additions and 27 deletions

View File

@ -281,12 +281,14 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
return nrm;
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
ComplexD nrm = rankInnerProduct(left,right);
// std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
// GridNormLog(real(nrm)); // Could log before and after global sum to distinguish local and MPI
grid->GlobalSum(nrm);
// GridNormLog(real(nrm));
return nrm;
}

View File

@ -411,7 +411,7 @@ public:
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@ -428,7 +428,6 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
@ -449,29 +448,12 @@ public:
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
if ( britney ) {
Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
} else {
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#endif
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient

View File

@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts)
}
#endif
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
@ -86,11 +90,83 @@ NAMESPACE_BEGIN(Grid);
static Coordinate Grid_default_latt;
static Coordinate Grid_default_mpi;
///////////////////////////////////////////////////////
// Grid Norm logging for repro testing
///////////////////////////////////////////////////////
int GridNormLoggingMode;
int32_t GridNormLoggingCounter;
std::vector<double> GridNormLogVector;
void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
{
switch ( mode ) {
case GridNormLoggingModePrint:
SetGridNormLoggingModePrint();
break;
case GridNormLoggingModeRecord:
SetGridNormLoggingModeRecord();
break;
case GridNormLoggingModeVerify:
SetGridNormLoggingModeVerify();
break;
case GridNormLoggingModeNone:
GridNormLoggingMode = mode;
GridNormLoggingCounter=0;
GridNormLogVector.resize(0);
break;
default:
assert(0);
}
}
void SetGridNormLoggingModePrint(void)
{
GridNormLoggingCounter = 0;
GridNormLogVector.resize(0);
GridNormLoggingMode = GridNormLoggingModePrint;
}
void SetGridNormLoggingModeRecord(void)
{
GridNormLoggingCounter = 0;
GridNormLogVector.resize(0);
GridNormLoggingMode = GridNormLoggingModeRecord;
}
void SetGridNormLoggingModeVerify(void)
{
GridNormLoggingCounter = 0;
GridNormLoggingMode = GridNormLoggingModeVerify;
}
void GridNormLog(double value)
{
if(GridNormLoggingMode == GridNormLoggingModePrint) {
std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value <<std::endl;
GridNormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeRecord) {
GridNormLogVector.push_back(value);
GridNormLoggingCounter++;
}
if(GridNormLoggingMode == GridNormLoggingModeVerify) {
assert(GridNormLoggingCounter < GridNormLogVector.size());
if ( value != GridNormLogVector[GridNormLoggingCounter] ) {
fprintf(stderr,"%s Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e\n",GridHostname(),GridNormLoggingCounter,GridNormLogVector.size(),
value, GridNormLogVector[GridNormLoggingCounter]); fflush(stderr);
}
GridNormLoggingCounter++;
}
}
int GridThread::_threads =1;
int GridThread::_hyperthreads=1;
int GridThread::_cores=1;
char hostname[HOST_NAME_MAX+1];
char *GridHostname(void)
{
return hostname;
}
const Coordinate &GridDefaultLatt(void) {return Grid_default_latt;};
const Coordinate &GridDefaultMpi(void) {return Grid_default_mpi;};
const Coordinate GridDefaultSimd(int dims,int nsimd)
@ -393,7 +469,6 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;

View File

@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
void Grid_init(int *argc,char ***argv);
void Grid_finalize(void);
char * GridHostname(void);
// internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);
@ -68,5 +70,20 @@ void GridParseLayout(char **argv,int argc,
void printHash(void);
enum GridNormLoggingMode_t {
GridNormLoggingModeNone,
GridNormLoggingModePrint,
GridNormLoggingModeRecord,
GridNormLoggingModeVerify
};
extern int GridNormLoggingMode;
extern int32_t GridNormLoggingCounter;
extern std::vector<double> GridNormLogVector;
void SetGridNormLoggingModePrint(void);
void SetGridNormLoggingModeRecord(void);
void SetGridNormLoggingModeVerify(void);
void SetGridNormLoggingMode(GridNormLoggingMode_t mode);
void GridNormLog(double value);
NAMESPACE_END(Grid);

View File

@ -0,0 +1,41 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=128
#PBS -l walltime=02:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 16 nodes, 192 ranks
# 12 ppn, 128 nodes, 1536 ranks
CMD="mpiexec -np 1536 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
$CMD

View File

@ -4,7 +4,7 @@
#PBS -q EarlyAppAccess
#PBS -l select=16
#PBS -l walltime=01:00:00
#PBS -l walltime=02:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
@ -36,5 +36,6 @@ export MPICH_OFI_NIC_POLICY=GPU
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 "
#--comms-overlap
$CMD

View File

@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
$CMD

View File

@ -30,6 +30,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
int main (int argc, char ** argv)
{
char hostname[HOST_NAME_MAX+1];
@ -104,6 +108,11 @@ int main (int argc, char ** argv)
csumref=0;
int iter=0;
do {
if ( iter == 0 ) {
SetGridNormLoggingMode(GridNormLoggingModeRecord);
} else {
SetGridNormLoggingMode(GridNormLoggingModeVerify);
}
std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
result_o = Zero();
t1=usecond();
@ -135,6 +144,11 @@ int main (int argc, char ** argv)
csumref=0;
int i=0;
do {
if ( iter == 0 ) {
SetGridNormLoggingMode(GridNormLoggingModeRecord);
} else {
SetGridNormLoggingMode(GridNormLoggingModeVerify);
}
std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
result_o_2 = Zero();
t1=usecond();