Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: fix/HOST_N ... 6d7219b59d (16 commits)
| SHA1 |
|---|
| 6d7219b59d |
| 79ad567dd5 |
| fab1efb48c |
| 660eb76d93 |
| 461cd045c6 |
| fee65d7a75 |
| 31f9971dbf |
| 62e7bf024a |
| 95f3d69cf9 |
| 89c0519f83 |
| 2704b82084 |
| cf8632bbac |
| d224297972 |
| a4d11a630f |
| d87296f3e8 |
| be94cf1c6f |

| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #define _GRID_FFT_H_ | #define _GRID_FFT_H_ | ||||||
|  |  | ||||||
| #ifdef HAVE_FFTW | #ifdef HAVE_FFTW | ||||||
| #ifdef USE_MKL | #if defined(USE_MKL) || defined(GRID_SYCL) | ||||||
| #include <fftw/fftw3.h> | #include <fftw/fftw3.h> | ||||||
| #else | #else | ||||||
| #include <fftw3.h> | #include <fftw3.h> | ||||||
|   | |||||||
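The FFT header hunk above widens the MKL-style include to SYCL builds, since oneMKL ships its FFTW3 wrapper header as `fftw/fftw3.h`. A minimal stand-alone smoke test of that interface (hedged: assumes the standard FFTW3 API, which the MKL wrappers also expose; not Grid code) could look like:

```cpp
// Same include selection as the patch; compile with -DUSE_MKL or -DGRID_SYCL
// against oneMKL, or with neither against stock FFTW3 (-lfftw3).
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
#endif

int main(void) {
  const int N = 8;
  fftw_complex in[N], out[N];
  for (int i = 0; i < N; i++) { in[i][0] = i; in[i][1] = 0.0; }
  fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
  fftw_execute(p);        // forward DFT of a real ramp
  fftw_destroy_plan(p);
  return 0;
}
```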
| @@ -281,12 +281,17 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|  |   uint32_t csum=0; | ||||||
|  |   //  Uint32Checksum(left,csum); | ||||||
|   ComplexD nrm = rankInnerProduct(left,right); |   ComplexD nrm = rankInnerProduct(left,right); | ||||||
|   //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl; |   RealD local = real(nrm); | ||||||
|  |   GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|  |   GridMPINormLog(local,real(nrm));  | ||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
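Linearised for readability, the new `innerProduct` body brackets the MPI reduction with the two logging hooks, which is what lets a later verify pass distinguish a node-local summation error from an `MPI_Allreduce` one (comments added; `csum` is currently always zero because the checksum call is commented out):

```cpp
ComplexD nrm   = rankInnerProduct(left, right);  // node-local reduction only
RealD    local = real(nrm);
GridNormLog(local, csum);            // log the rank-local partial sum
grid->GlobalSum(nrm);                // MPI all-reduce across ranks
GridMPINormLog(local, real(nrm));    // log local -> global result
return nrm;
```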
| @@ -411,7 +411,7 @@ public: | |||||||
|       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; |       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; | ||||||
|       SeedFixedIntegers(seeds); |       SeedFixedIntegers(seeds); | ||||||
|     } |     } | ||||||
|   void SeedFixedIntegers(const std::vector<int> &seeds){ |   void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){ | ||||||
|  |  | ||||||
|     // Everyone generates the same seed_seq based on input seeds |     // Everyone generates the same seed_seq based on input seeds | ||||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); |     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||||
| @@ -428,7 +428,6 @@ public: | |||||||
|     // MT implementation does not implement fast discard even though |     // MT implementation does not implement fast discard even though | ||||||
|     // in principle this is possible |     // in principle this is possible | ||||||
|     //////////////////////////////////////////////// |     //////////////////////////////////////////////// | ||||||
| #if 1 |  | ||||||
|     thread_for( lidx, _grid->lSites(), { |     thread_for( lidx, _grid->lSites(), { | ||||||
|  |  | ||||||
| 	int gidx; | 	int gidx; | ||||||
| @@ -449,29 +448,12 @@ public: | |||||||
| 	 | 	 | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
| 	_generators[l_idx] = master_engine; | 	_generators[l_idx] = master_engine; | ||||||
| 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	if ( britney ) {  | ||||||
|     }); | 	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence | ||||||
| #else | 	} else { 	 | ||||||
|     // Everybody loops over global volume. |  | ||||||
|     thread_for( gidx, _grid->_gsites, { |  | ||||||
|  |  | ||||||
| 	// Where is it? |  | ||||||
| 	int rank; |  | ||||||
| 	int o_idx; |  | ||||||
| 	int i_idx; |  | ||||||
|  |  | ||||||
| 	Coordinate gcoor; |  | ||||||
| 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); |  | ||||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
| 	 |  | ||||||
| 	// If this is one of mine we take it |  | ||||||
| 	if( rank == _grid->ThisRank() ){ |  | ||||||
| 	  int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
| 	  _generators[l_idx] = master_engine; |  | ||||||
| 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | ||||||
| 	} | 	} | ||||||
|     }); |     }); | ||||||
| #endif |  | ||||||
| #else  | #else  | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     // Machine and thread decomposition dependent seeding is efficient |     // Machine and thread decomposition dependent seeding is efficient | ||||||
|   | |||||||
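The new optional `britney` argument (default 0) keeps the old behaviour: each site's engine is copied from the master engine and skipped ahead by its global site index, so the streams are independent of the MPI decomposition. A non-zero value skips by the local index instead, which is cheaper but makes the streams layout-dependent. A hedged usage sketch, assuming Grid's usual initialisation boilerplate:

```cpp
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());

  std::vector<int> seeds({1, 2, 3, 4});
  GridParallelRNG pRNG(UGrid);
  pRNG.SeedFixedIntegers(seeds);     // default (britney=0): skip by global index,
                                     // reproducible across decompositions
  pRNG.SeedFixedIntegers(seeds, 1);  // britney=1: skip by local index; faster,
                                     // but streams depend on the rank layout

  Grid_finalize();
  return 0;
}
```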
| @@ -1,5 +1,5 @@ | |||||||
| #pragma once | #pragma once | ||||||
| #include <type_traits> |  | ||||||
| #if defined(GRID_CUDA) | #if defined(GRID_CUDA) | ||||||
|  |  | ||||||
| #include <cub/cub.cuh> | #include <cub/cub.cuh> | ||||||
| @@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | |||||||
|    |    | ||||||
|  |  | ||||||
| } | } | ||||||
|  | #endif  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|  | #if defined(GRID_SYCL) | ||||||
|  | template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
|  | { | ||||||
|  |   size_t subvol_size = e1*e2; | ||||||
|  |  | ||||||
|  |   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); | ||||||
|  |   vobj vobj_zero; | ||||||
|  |   zeroit(vobj_zero); | ||||||
|  |   for (int r = 0; r<rd; r++) {  | ||||||
|  |     mysum[r] = vobj_zero;  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   commVector<vobj> reduction_buffer(rd*subvol_size);     | ||||||
|  |  | ||||||
|  |   auto rb_p = &reduction_buffer[0]; | ||||||
|  |  | ||||||
|  |   // autoView(Data_v, Data, AcceleratorRead); | ||||||
|  |  | ||||||
|  |   //prepare reduction buffer  | ||||||
|  |   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  | ||||||
|  |    | ||||||
|  |       int n = s / e2; | ||||||
|  |       int b = s % e2; | ||||||
|  |       int so=r*ostride; // base offset for start of plane  | ||||||
|  |       int ss= so+n*stride+b; | ||||||
|  |  | ||||||
|  |       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); | ||||||
|  |  | ||||||
|  |   }); | ||||||
|  |  | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|  |           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); | ||||||
|  |           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, | ||||||
|  |           Reduction, | ||||||
|  |           [=](cl::sycl::id<1> item, auto &sum) { | ||||||
|  |               auto s = item[0]; | ||||||
|  |               sum += rb_p[r*subvol_size+s]; | ||||||
|  |           }); | ||||||
|  |       }); | ||||||
|  |        | ||||||
|  |       | ||||||
|  |   } | ||||||
|  |   theGridAccelerator->wait(); | ||||||
|  |   for (int r = 0; r < rd; r++) { | ||||||
|  |     lvSum[r] = mysum[r]; | ||||||
|  |   } | ||||||
|  |   free(mysum,*theGridAccelerator); | ||||||
|  | } | ||||||
|  | #endif | ||||||
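The small-object SYCL path above submits one `sycl::reduction` kernel per reduced plane `r`, accumulating into USM shared memory, and waits once after the loop so the submissions can overlap. A stand-alone illustration of the same idiom (hedged: plain SYCL 2020 with its own queue, not Grid's `theGridAccelerator`):

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;
  const size_t N = 1024;
  double *in  = sycl::malloc_shared<double>(N, q);
  double *sum = sycl::malloc_shared<double>(1, q);
  for (size_t i = 0; i < N; i++) in[i] = 1.0;
  *sum = 0.0;

  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(sum, std::plus<double>());
    cgh.parallel_for(sycl::range<1>{N}, red,
                     [=](sycl::id<1> i, auto &acc) { acc += in[i]; });
  });
  q.wait();                        // as in the patch: a single wait at the end

  std::printf("sum = %f (expect %zu)\n", *sum, N);
  sycl::free(in, q);
  sycl::free(sum, q);
  return 0;
}
```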
|  |  | ||||||
|  | template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | ||||||
|   typedef typename vobj::vector_type vector; |   typedef typename vobj::vector_type vector; | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|   const int osites = rd*e1*e2; |   const int osites = rd*e1*e2; | ||||||
| @@ -106,8 +159,12 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
| 	    buf[ss] = dat[ss*words+w]; | 	    buf[ss] = dat[ss*words+w]; | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |     #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|        |       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #elif defined(GRID_SYCL) | ||||||
|  |       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); | ||||||
|  |     #endif | ||||||
|  |  | ||||||
|     for (int r = 0; r < rd; r++) { |     for (int r = 0; r < rd; r++) { | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||||
|     } |     } | ||||||
| @@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V | |||||||
|    |    | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | ||||||
| { | { | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. |   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |     if constexpr (sizeof(vobj) <= 256) {  | ||||||
|       sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|  |       #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
|  |         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #elif defined (GRID_SYCL) | ||||||
|  |         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  |       #endif | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     else { |     else { | ||||||
|       sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_SYCL) |  | ||||||
| template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::scalar_object sobj; |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|  |  | ||||||
|   vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); |  | ||||||
|   vobj vobj_zero; |  | ||||||
|   zeroit(vobj_zero); |  | ||||||
|      |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |  | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|  |  | ||||||
|   autoView(Data_v, Data, AcceleratorRead); |  | ||||||
|  |  | ||||||
|   //prepare reduction buffer  |  | ||||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  |  | ||||||
|    |  | ||||||
|       int n = s / e2; |  | ||||||
|       int b = s % e2; |  | ||||||
|       int so=r*ostride; // base offset for start of plane  |  | ||||||
|       int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|       mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable |  | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|           auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); |  | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |  | ||||||
|           Reduction, |  | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |  | ||||||
|               auto s = item[0]; |  | ||||||
|               sum += rb_p[r*subvol_size+s]; |  | ||||||
|           }); |  | ||||||
|       }); |  | ||||||
|       theGridAccelerator->wait(); |  | ||||||
|       lvSum[r] = mysum[0]; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   free(mysum,*theGridAccelerator); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | ||||||
| { | { | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
| @@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | |||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | ||||||
| { | { | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) |   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||||
|    |    | ||||||
|   sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|    |  | ||||||
|   #elif defined(GRID_SYCL) |  | ||||||
|    |  | ||||||
|   sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|    |    | ||||||
|   #else |   #else | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|   | |||||||
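Dispatch between the small and large paths is resolved at compile time on the size of the vector object; the 256-byte threshold in the hunk above is what keeps the per-item type small enough for cub and the SYCL reduction. A stand-alone illustration of that `if constexpr` pattern (not Grid code; the structs are made up):

```cpp
#include <cstdio>

// Compile-time size dispatch mirroring the 256-byte threshold in the patch.
template <class vobj>
void sliceSumDispatch(const vobj &) {
  if constexpr (sizeof(vobj) <= 256)
    std::printf("small path (%zu bytes): direct library reduction\n", sizeof(vobj));
  else
    std::printf("large path (%zu bytes): split word-by-word first\n", sizeof(vobj));
}

struct SmallObj { double d[4];  };   //  32 bytes -> small path
struct LargeObj { double d[64]; };   // 512 bytes -> large path

int main() {
  sliceSumDispatch(SmallObj{});
  sliceSumDispatch(LargeObj{});
  return 0;
}
```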
| @@ -90,11 +90,139 @@ NAMESPACE_BEGIN(Grid); | |||||||
| static Coordinate Grid_default_latt; | static Coordinate Grid_default_latt; | ||||||
| static Coordinate Grid_default_mpi; | static Coordinate Grid_default_mpi; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | // Grid Norm logging for repro testing | ||||||
|  | /////////////////////////////////////////////////////// | ||||||
|  | int GridNormLoggingMode; | ||||||
|  | int32_t GridNormLoggingCounter; | ||||||
|  | int32_t GridMPINormLoggingCounter; | ||||||
|  | std::vector<double> GridNormLogVector; | ||||||
|  | std::vector<double> GridMPINormLogVector; | ||||||
|  | std::vector<uint32_t> GridCsumLogVector; | ||||||
|  |  | ||||||
|  | void SetGridNormLoggingMode(GridNormLoggingMode_t mode) | ||||||
|  | { | ||||||
|  |   switch ( mode ) { | ||||||
|  |   case GridNormLoggingModePrint: | ||||||
|  |     SetGridNormLoggingModePrint(); | ||||||
|  |     break; | ||||||
|  |   case GridNormLoggingModeRecord: | ||||||
|  |     SetGridNormLoggingModeRecord(); | ||||||
|  |     break; | ||||||
|  |   case GridNormLoggingModeVerify: | ||||||
|  |     SetGridNormLoggingModeVerify(); | ||||||
|  |     break; | ||||||
|  |   case GridNormLoggingModeNone: | ||||||
|  |     GridNormLoggingMode = mode; | ||||||
|  |     GridNormLoggingCounter=0; | ||||||
|  |     GridMPINormLoggingCounter=0; | ||||||
|  |     GridNormLogVector.resize(0); | ||||||
|  |     GridCsumLogVector.resize(0); | ||||||
|  |     GridMPINormLogVector.resize(0); | ||||||
|  |     break; | ||||||
|  |   default: | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void SetGridNormLoggingModePrint(void) | ||||||
|  | { | ||||||
|  |   std::cout << " GridNormLogging Reproducibility logging set to print output " <<std::endl; | ||||||
|  |   GridNormLoggingCounter = 0; | ||||||
|  |   GridMPINormLoggingCounter=0; | ||||||
|  |   GridNormLogVector.resize(0); | ||||||
|  |   GridCsumLogVector.resize(0); | ||||||
|  |   GridMPINormLogVector.resize(0); | ||||||
|  |   GridNormLoggingMode = GridNormLoggingModePrint; | ||||||
|  | } | ||||||
|  | void SetGridNormLoggingModeRecord(void) | ||||||
|  | { | ||||||
|  |   std::cout << " GridNormLogging Reproducibility logging set to RECORD " <<std::endl; | ||||||
|  |   GridNormLoggingCounter = 0; | ||||||
|  |   GridMPINormLoggingCounter=0; | ||||||
|  |   GridNormLogVector.resize(0); | ||||||
|  |   GridCsumLogVector.resize(0); | ||||||
|  |   GridMPINormLogVector.resize(0); | ||||||
|  |   GridNormLoggingMode = GridNormLoggingModeRecord; | ||||||
|  | } | ||||||
|  | void SetGridNormLoggingModeVerify(void) | ||||||
|  | { | ||||||
|  |   std::cout << " GridNormLogging Reproducibility logging set to VERIFY " << GridNormLogVector.size()<< " log entries "<<std::endl; | ||||||
|  |   GridNormLoggingCounter = 0; | ||||||
|  |   GridMPINormLoggingCounter=0; | ||||||
|  |   GridNormLoggingMode = GridNormLoggingModeVerify; | ||||||
|  | } | ||||||
|  | void GridNormLog(double value,uint32_t csum) | ||||||
|  | { | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModePrint) { | ||||||
|  |     std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value << " csum " <<std::hex<<csum<<std::dec <<std::endl; | ||||||
|  |     GridNormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeRecord) { | ||||||
|  |     GridNormLogVector.push_back(value); | ||||||
|  |     GridCsumLogVector.push_back(csum); | ||||||
|  |     GridNormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeVerify) { | ||||||
|  |     assert(GridNormLoggingCounter < GridNormLogVector.size()); | ||||||
|  |     if ( (value != GridNormLogVector[GridNormLoggingCounter]) | ||||||
|  | 	 || (csum!=GridCsumLogVector[GridNormLoggingCounter]) ) { | ||||||
|  |       std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<GridNormLogVector[GridNormLoggingCounter] <<std::endl; | ||||||
|  |       std::cerr << " Oops got csum "<< std::hex<<csum<<" expect "<<GridCsumLogVector[GridNormLoggingCounter] <<std::endl; | ||||||
|  |       fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e %x %x\n", | ||||||
|  | 	      GridHostname(), | ||||||
|  | 	      GlobalSharedMemory::WorldShmRank, | ||||||
|  | 	      GridNormLoggingCounter,GridNormLogVector.size(), | ||||||
|  | 	      value, GridNormLogVector[GridNormLoggingCounter], | ||||||
|  | 	      csum, GridCsumLogVector[GridNormLoggingCounter]); fflush(stderr); | ||||||
|  |       assert(0); // Force takedown of job | ||||||
|  |     } | ||||||
|  |     if ( GridNormLogVector.size()==GridNormLoggingCounter ) { | ||||||
|  |       std::cout << " GridNormLogging : Verified entire sequence of "<<GridNormLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     GridNormLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void GridMPINormLog(double local,double result) | ||||||
|  | { | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModePrint) { | ||||||
|  |     std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << " -> " <<result <<std::endl; | ||||||
|  |     GridMPINormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeRecord) { | ||||||
|  |     std::cerr<<"GridMPINormLog RECORDING : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl; | ||||||
|  |     GridMPINormLogVector.push_back(result); | ||||||
|  |     GridMPINormLoggingCounter++; | ||||||
|  |   } | ||||||
|  |   if(GridNormLoggingMode == GridNormLoggingModeVerify) { | ||||||
|  |     std::cerr<<"GridMPINormLog : "<< GridMPINormLoggingCounter <<" " << std::hexfloat << local << "-> "<< result <<std::endl; | ||||||
|  |     assert(GridMPINormLoggingCounter < GridMPINormLogVector.size()); | ||||||
|  |     if ( result != GridMPINormLogVector[GridMPINormLoggingCounter] ) { | ||||||
|  |       fprintf(stderr,"%s:%d MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e hist %.16e\n", | ||||||
|  | 	      GridHostname(), | ||||||
|  | 	      GlobalSharedMemory::WorldShmRank, | ||||||
|  | 	      GridMPINormLoggingCounter,GridMPINormLogVector.size(), | ||||||
|  | 	      result, local, GridMPINormLogVector[GridMPINormLoggingCounter]); fflush(stderr); | ||||||
|  |       assert(0); // Force takedown of job | ||||||
|  |     } | ||||||
|  |     if ( GridMPINormLogVector.size()==GridMPINormLoggingCounter ) { | ||||||
|  |       std::cout << " GridMPINormLogging : Verified entire sequence of "<<GridMPINormLoggingCounter<<" norms "<<std::endl; | ||||||
|  |     } | ||||||
|  |     GridMPINormLoggingCounter++; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| int GridThread::_threads =1; | int GridThread::_threads =1; | ||||||
| int GridThread::_hyperthreads=1; | int GridThread::_hyperthreads=1; | ||||||
| int GridThread::_cores=1; | int GridThread::_cores=1; | ||||||
|  |  | ||||||
|  | char hostname[HOST_NAME_MAX+1]; | ||||||
|  |  | ||||||
|  | char *GridHostname(void) | ||||||
|  | { | ||||||
|  |   return hostname; | ||||||
|  | } | ||||||
| const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | ||||||
| const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | ||||||
| const Coordinate GridDefaultSimd(int dims,int nsimd) | const Coordinate GridDefaultSimd(int dims,int nsimd) | ||||||
| @@ -397,7 +525,6 @@ void Grid_init(int *argc,char ***argv) | |||||||
|   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; |   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; | ||||||
|   std::cout << GridLogMessage << "================================================ "<<std::endl; |   std::cout << GridLogMessage << "================================================ "<<std::endl; | ||||||
|  |  | ||||||
|   char hostname[HOST_NAME_MAX+1]; |  | ||||||
|   gethostname(hostname, HOST_NAME_MAX+1); |   gethostname(hostname, HOST_NAME_MAX+1); | ||||||
|   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; |   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid); | |||||||
| void Grid_init(int *argc,char ***argv); | void Grid_init(int *argc,char ***argv); | ||||||
| void Grid_finalize(void); | void Grid_finalize(void); | ||||||
|  |  | ||||||
|  | char * GridHostname(void); | ||||||
|  |  | ||||||
| // internal, controled with --handle | // internal, controled with --handle | ||||||
| void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | ||||||
| void Grid_debug_handler_init(void); | void Grid_debug_handler_init(void); | ||||||
| @@ -68,5 +70,21 @@ void GridParseLayout(char **argv,int argc, | |||||||
| void printHash(void); | void printHash(void); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | enum GridNormLoggingMode_t { | ||||||
|  |   GridNormLoggingModeNone, | ||||||
|  |   GridNormLoggingModePrint, | ||||||
|  |   GridNormLoggingModeRecord, | ||||||
|  |   GridNormLoggingModeVerify | ||||||
|  | }; | ||||||
|  | //extern int GridNormLoggingMode; | ||||||
|  | //extern int32_t GridNormLoggingCounter; | ||||||
|  | //extern std::vector<double> GridNormLogVector; | ||||||
|  | void SetGridNormLoggingModePrint(void); | ||||||
|  | void SetGridNormLoggingModeRecord(void); | ||||||
|  | void SetGridNormLoggingModeVerify(void); | ||||||
|  | void SetGridNormLoggingMode(GridNormLoggingMode_t mode); | ||||||
|  | void GridNormLog(double value,uint32_t csum); | ||||||
|  | void GridMPINormLog(double lcl, double glbl); | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
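Taken together, the declarations above form a small reproducibility harness: record the norm (and checksum) history of a reference run, verify subsequent runs against it, print for eyeballing, or reset with `None`. A hedged usage sketch; `runSolve()` and `nreps` are hypothetical placeholders for whatever workload is being checked:

```cpp
SetGridNormLoggingMode(GridNormLoggingModeRecord);    // pass 1: store every
runSolve();                                           // logged norm/csum

for (int rep = 1; rep < nreps; rep++) {
  SetGridNormLoggingMode(GridNormLoggingModeVerify);  // later passes: compare;
  runSolve();                                         // a mismatch asserts with
}                                                     // the "Oops" message above
SetGridNormLoggingMode(GridNormLoggingModeNone);      // clear counters and logs
```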
							
								
								
									
systems/Aurora/tests/repro128.pbs (new file, 41 lines):
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||||
|  |  | ||||||
|  | #PBS -q EarlyAppAccess | ||||||
|  | #PBS -l select=128 | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | source ../sourceme.sh | ||||||
|  |  | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | export OMP_NUM_THREADS=3 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  |  | ||||||
|  | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||||
|  | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||||
|  | #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | # 12 ppn, 16 nodes, 192 ranks | ||||||
|  | # 12 ppn, 128 nodes, 1536 ranks | ||||||
|  | CMD="mpiexec -np 1536 -ppn 12  -envall \ | ||||||
|  | 	     ./gpu_tile_compact.sh \ | ||||||
|  | 	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \ | ||||||
|  | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap " | ||||||
|  | $CMD  | ||||||
| @@ -2,26 +2,39 @@ | |||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess | #PBS -l select=16:system=sunspot,place=scatter | ||||||
| #PBS -l select=16 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=01:00:00 | ||||||
|  | #PBS -N dwf | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread | #export OMP_PROC_BIND=spread | ||||||
| #unset OMP_PLACES | #unset OMP_PLACES | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
| source ../sourceme.sh | #source ../sourceme.sh | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | #export MPICH_COLL_SYNC=1 | ||||||
|  | #export MPICH_ENV_DISPLAY=1 | ||||||
|  | export MPICH_ | ||||||
| export OMP_NUM_THREADS=3 | export OMP_NUM_THREADS=3 | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  | #export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
| @@ -32,9 +45,17 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | |||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
| export MPICH_OFI_NIC_POLICY=GPU | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
| # 12 ppn, 16 nodes, 192 ranks | DIR=repro.$PBS_JOBID | ||||||
|  | mkdir $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ | CMD="mpiexec -np 192 -ppn 12  -envall \ | ||||||
| 	     ./gpu_tile_compact.sh \ | 	     ../gpu_tile_compact.sh \ | ||||||
| 	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ | 	     ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ | ||||||
| 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" | 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative" | ||||||
|  | #--comms-overlap | ||||||
| $CMD  | $CMD  | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
systems/Aurora/tests/repro1gpu.pbs (new file, 81 lines):
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N repro1gpu | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  |  | ||||||
|  | for g in {0..11} | ||||||
|  | do | ||||||
|  | export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) | ||||||
|  | export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) | ||||||
|  | export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) | ||||||
|  |  | ||||||
|  | export numa=${NUMA_MAP[$g]} | ||||||
|  | export gpu_id=${GPU_MAP[$g]} | ||||||
|  | export tile_id=${TILE_MAP[$g]} | ||||||
|  | export gpu=$gpu_id.$tile_id | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  | echo $gpu > gpu | ||||||
|  |  | ||||||
|  | export ZE_AFFINITY_MASK=$gpu | ||||||
|  | export ONEAPI_DEVICE_FILTER=gpu,level_zero | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 1 -ppn 1  -envall --hostfile nodefile \ | ||||||
|  | 	     numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message" | ||||||
|  | echo $CMD | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
							
								
								
									
systems/Aurora/tests/reproN.pbs (new file, 78 lines):
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | #PBS -l select=16:system=sunspot,place=scatter | ||||||
|  | #PBS -A LatticeQCD_aesp_CNDA | ||||||
|  | #PBS -l walltime=02:00:00 | ||||||
|  | #PBS -N reproN | ||||||
|  | #PBS -k doe | ||||||
|  |  | ||||||
|  | #export OMP_PROC_BIND=spread | ||||||
|  | #unset OMP_PLACES | ||||||
|  |  | ||||||
|  | module load oneapi/eng-compiler/2023.05.15.003 | ||||||
|  | module load mpich/51.2/icc-all-deterministic-pmix-gpu | ||||||
|  |  | ||||||
|  | # 56 cores / 6 threads ~9 | ||||||
|  | export OMP_NUM_THREADS=6 | ||||||
|  | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 | ||||||
|  | export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 | ||||||
|  | export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 | ||||||
|  | export MPICH_OFI_NIC_POLICY=GPU | ||||||
|  |  | ||||||
|  | export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 | ||||||
|  | export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling | ||||||
|  | unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  | unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | NN=`cat $PBS_NODEFILE | wc -l` | ||||||
|  | echo $PBS_NODEFILE | ||||||
|  | cat $PBS_NODEFILE | ||||||
|  |  | ||||||
|  | echo $NN nodes in node file | ||||||
|  | for n in `eval echo {1..$NN}` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | cd $PBS_O_WORKDIR | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | echo Node $n is $THIS_NODE | ||||||
|  |  | ||||||
|  | DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | mkdir -p $DIR | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | echo $THIS_NODE > nodefile | ||||||
|  |  | ||||||
|  | CMD="mpiexec -np 12 -ppn 12  -envall --hostfile nodefile \ | ||||||
|  | 	     ../../gpu_tile_compact.sh \ | ||||||
|  | 	     ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \ | ||||||
|  | 		--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" | ||||||
|  |  | ||||||
|  | $CMD & | ||||||
|  |  | ||||||
|  | done | ||||||
|  |  | ||||||
|  | wait | ||||||
|  |  | ||||||
|  | for n in ` eval echo {1..$NN} ` | ||||||
|  | do | ||||||
|  |  | ||||||
|  | THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 ` | ||||||
|  | DIR=repro.$PBS_JOBID/node-$n-$THIS_NODE | ||||||
|  |  | ||||||
|  | cd $DIR | ||||||
|  |  | ||||||
|  | grep Oops Grid.stderr.* > failures.$PBS_JOBID | ||||||
|  | rm core.* | ||||||
|  |  | ||||||
|  | done | ||||||
| @@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU | |||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ | CMD="mpiexec -np 192 -ppn 12  -envall \ | ||||||
| 	     ./gpu_tile_compact.sh \ | 	     ./gpu_tile_compact.sh \ | ||||||
| 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ | 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ | ||||||
| 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" | 	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap" | ||||||
| $CMD  | $CMD  | ||||||
|   | |||||||
| @@ -34,6 +34,45 @@ using namespace Grid; | |||||||
| #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  | template<class Matrix,class Field> | ||||||
|  |   class SchurDiagMooeeOperatorParanoid :  public SchurOperatorBase<Field> { | ||||||
|  |  public: | ||||||
|  |     Matrix &_Mat; | ||||||
|  |     SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){}; | ||||||
|  |     virtual  void Mpc      (const Field &in, Field &out) { | ||||||
|  |       Field tmp(in.Grid()); | ||||||
|  |       tmp.Checkerboard() = !in.Checkerboard(); | ||||||
|  |       //      std::cout <<" Mpc starting"<<std::endl; | ||||||
|  |  | ||||||
|  |       RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.Meooe(in,tmp); | ||||||
|  |       nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl; | ||||||
|  |       _Mat.MooeeInv(tmp,out); | ||||||
|  |       nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.Meooe(out,tmp); | ||||||
|  |       nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl; | ||||||
|  |       _Mat.Mooee(in,out); | ||||||
|  |       nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl; | ||||||
|  |       axpy(out,-1.0,tmp,out); | ||||||
|  |     } | ||||||
|  |     virtual void MpcDag   (const Field &in, Field &out){ | ||||||
|  |       Field tmp(in.Grid()); | ||||||
|  |       //      std::cout <<" MpcDag starting"<<std::endl; | ||||||
|  |       RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.MeooeDag(in,tmp); | ||||||
|  |       _Mat.MooeeInvDag(tmp,out); | ||||||
|  |       nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl; | ||||||
|  |       _Mat.MeooeDag(out,tmp); | ||||||
|  |       nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl; | ||||||
|  |       _Mat.MooeeDag(in,out); | ||||||
|  |       nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl; | ||||||
|  |       axpy(out,-1.0,tmp,out); | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
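The "paranoid" operator above is functionally the usual Schur even-odd preconditioned matrix, but it takes a `norm2` (a logged global reduction) after every intermediate step, so a verify-mode failure can be pinned to an individual `Meooe` or `MooeeInv` application rather than to the whole `Mpc`. For reference, the operator it reproduces is the standard

$$ M_{pc} \;=\; M_{oo} \;-\; M_{oe}\, M_{ee}^{-1}\, M_{eo} . $$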
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   char hostname[HOST_NAME_MAX+1]; |   char hostname[HOST_NAME_MAX+1]; | ||||||
| @@ -82,8 +121,8 @@ int main (int argc, char ** argv) | |||||||
|   result_o_2.Checkerboard() = Odd; |   result_o_2.Checkerboard() = Odd; | ||||||
|   result_o_2 = Zero(); |   result_o_2 = Zero(); | ||||||
|  |  | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); |   SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); | ||||||
|   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); |   SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); | ||||||
|  |  | ||||||
|   int nsecs=600; |   int nsecs=600; | ||||||
|   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ |   if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ | ||||||
| @@ -108,6 +147,11 @@ int main (int argc, char ** argv) | |||||||
|   csumref=0; |   csumref=0; | ||||||
|   int iter=0; |   int iter=0; | ||||||
|   do { |   do { | ||||||
|  |     if ( iter == 0 ) { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeRecord); | ||||||
|  |     } else { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeVerify); | ||||||
|  |     } | ||||||
|     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; |     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl; | ||||||
|     result_o = Zero(); |     result_o = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond(); | ||||||
| @@ -139,6 +183,11 @@ int main (int argc, char ** argv) | |||||||
|   csumref=0; |   csumref=0; | ||||||
|   int i=0; |   int i=0; | ||||||
|   do {  |   do {  | ||||||
|  |     if ( i == 0 ) { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeRecord); | ||||||
|  |     } else { | ||||||
|  |       SetGridNormLoggingMode(GridNormLoggingModeVerify); | ||||||
|  |     } | ||||||
|     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; |     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl; | ||||||
|     result_o_2 = Zero(); |     result_o_2 = Zero(); | ||||||
|     t1=usecond(); |     t1=usecond(); | ||||||
|   | |||||||