mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Compare commits
	
		
			17 Commits
		
	
	
		
			9fa8bd6438
			...
			a78a61d76f
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| a78a61d76f | |||
| 2eff3f34ed | |||
| 03687c1d62 | |||
| febfe4e77f | |||
| 4d1aa134b5 | |||
| 5ec879860a | |||
| b728af903c | |||
| 54f1999030 | |||
| fd58f0b669 | |||
| c5c67b706e | |||
| be7a543e2c | |||
| 68f112d576 | |||
| ec1395a304 | |||
| beb0e474ee | |||
| 2b5fdcbbc5 | |||
| 295127d456 | |||
| 7dcfb13694 | 
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
  typedef cublasHandle_t gridblasHandle_t;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
  typedef cl::sycl::queue *gridblasHandle_t;
 | 
			
		||||
  typedef sycl::queue *gridblasHandle_t;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_ONE_MKL
 | 
			
		||||
  typedef cl::sycl::queue *gridblasHandle_t;
 | 
			
		||||
  typedef sycl::queue *gridblasHandle_t;
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
 | 
			
		||||
  typedef int32_t gridblasHandle_t;
 | 
			
		||||
@@ -89,9 +89,9 @@ public:
 | 
			
		||||
      gridblasHandle = theGridAccelerator;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_ONE_MKL
 | 
			
		||||
      cl::sycl::gpu_selector selector;
 | 
			
		||||
      cl::sycl::device selectedDevice { selector };
 | 
			
		||||
      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
 | 
			
		||||
      sycl::gpu_selector selector;
 | 
			
		||||
      sycl::device selectedDevice { selector };
 | 
			
		||||
      sycl::property_list q_prop{sycl::property::queue::in_order()};
 | 
			
		||||
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 | 
			
		||||
#endif
 | 
			
		||||
      gridblasInit=1;
 | 
			
		||||
 
 | 
			
		||||
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
 | 
			
		||||
      //Compute double precision rsd and also new RHS vector.
 | 
			
		||||
      Linop_d.HermOp(sol_d, tmp_d);
 | 
			
		||||
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
 | 
			
		||||
      
 | 
			
		||||
      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
 | 
			
		||||
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
 | 
			
		||||
 | 
			
		||||
      if(norm < OuterLoopNormMult * stop){
 | 
			
		||||
	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 | 
			
		||||
	break;
 | 
			
		||||
      }
 | 
			
		||||
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 | 
			
		||||
      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 | 
			
		||||
 | 
			
		||||
      PrecChangeTimer.Start();
 | 
			
		||||
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
 | 
			
		||||
 
 | 
			
		||||
@@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 | 
			
		||||
// very VERY rarely (Log, serial RNG) we need world without a grid
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifdef USE_GRID_REDUCTION
 | 
			
		||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumP2P(c);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumP2P(c);
 | 
			
		||||
}
 | 
			
		||||
#else
 | 
			
		||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumVector((float *)&c,2);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumVector((float *)c,2*N);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumVector((double *)&c,2);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumVector((float *)c,2*N);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 | 
			
		||||
{
 | 
			
		||||
  GlobalSumVector((double *)c,2*N);
 | 
			
		||||
 
 | 
			
		||||
@@ -128,6 +128,34 @@ public:
 | 
			
		||||
  void GlobalXOR(uint32_t &);
 | 
			
		||||
  void GlobalXOR(uint64_t &);
 | 
			
		||||
  
 | 
			
		||||
  template<class obj> void GlobalSumP2P(obj &o)
 | 
			
		||||
  {
 | 
			
		||||
    std::vector<obj> column;
 | 
			
		||||
    obj accum = o;
 | 
			
		||||
    int source,dest;
 | 
			
		||||
    for(int d=0;d<_ndimension;d++){
 | 
			
		||||
      column.resize(_processors[d]);
 | 
			
		||||
      column[0] = accum;
 | 
			
		||||
      std::vector<CommsRequest_t> list;
 | 
			
		||||
      for(int p=1;p<_processors[d];p++){
 | 
			
		||||
	ShiftedRanks(d,p,source,dest);
 | 
			
		||||
	SendToRecvFromBegin(list,
 | 
			
		||||
			    &column[0],
 | 
			
		||||
			    dest,
 | 
			
		||||
			    &column[p],
 | 
			
		||||
			    source,
 | 
			
		||||
			    sizeof(obj),d*100+p);
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
      CommsComplete(list);
 | 
			
		||||
      for(int p=1;p<_processors[d];p++){
 | 
			
		||||
	accum = accum + column[p];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    Broadcast(0,accum);
 | 
			
		||||
    o=accum;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class obj> void GlobalSum(obj &o){
 | 
			
		||||
    typedef typename obj::scalar_type scalar_type;
 | 
			
		||||
    int words = sizeof(obj)/sizeof(scalar_type);
 | 
			
		||||
 
 | 
			
		||||
@@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator()
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
#ifdef USE_GRID_REDUCTION
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  CartesianCommunicator::GlobalSumP2P(f);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  CartesianCommunicator::GlobalSumP2P(d);
 | 
			
		||||
}
 | 
			
		||||
#else
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
@@ -287,20 +306,11 @@ void CartesianCommunicator::GlobalMax(double &d)
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
 
 | 
			
		||||
@@ -569,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
			
		||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
			
		||||
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
 | 
			
		||||
 | 
			
		||||
    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
 | 
			
		||||
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
 | 
			
		||||
    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
 | 
			
		||||
    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
 | 
			
		||||
      
 | 
			
		||||
    ze_ipc_mem_handle_t ihandle;
 | 
			
		||||
    clone_mem_t handle;
 | 
			
		||||
 
 | 
			
		||||
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define FAST_AXPY_NORM
 | 
			
		||||
template<class sobj,class vobj> inline
 | 
			
		||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 | 
			
		||||
{
 | 
			
		||||
  GRID_TRACE("axpy_norm");
 | 
			
		||||
#ifdef FAST_AXPY_NORM
 | 
			
		||||
  return axpy_norm_fast(ret,a,x,y);
 | 
			
		||||
#else
 | 
			
		||||
  ret = a*x+y;
 | 
			
		||||
  RealD nn=norm2(ret);
 | 
			
		||||
  return nn;
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
template<class sobj,class vobj> inline
 | 
			
		||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 | 
			
		||||
{
 | 
			
		||||
  GRID_TRACE("axpby_norm");
 | 
			
		||||
#ifdef FAST_AXPY_NORM
 | 
			
		||||
  return axpby_norm_fast(ret,a,b,x,y);
 | 
			
		||||
#else
 | 
			
		||||
  ret = a*x+b*y;
 | 
			
		||||
  RealD nn=norm2(ret);
 | 
			
		||||
  return nn;
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Trace product
 | 
			
		||||
 
 | 
			
		||||
@@ -290,8 +290,10 @@ template<class vobj>
 | 
			
		||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
 | 
			
		||||
  GridBase *grid = left.Grid();
 | 
			
		||||
 | 
			
		||||
  bool ok;
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
  uint64_t csum=0;
 | 
			
		||||
  uint64_t csum2=0;
 | 
			
		||||
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
 | 
			
		||||
  {
 | 
			
		||||
    // Hack
 | 
			
		||||
@@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 | 
			
		||||
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
 | 
			
		||||
    uint64_t *base= (uint64_t *)&l_v[0];
 | 
			
		||||
    csum=svm_xor(base,words);
 | 
			
		||||
    ok = FlightRecorder::CsumLog(csum);
 | 
			
		||||
    if ( !ok ) {
 | 
			
		||||
      csum2=svm_xor(base,words);
 | 
			
		||||
      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
			
		||||
    } else {
 | 
			
		||||
      //      csum2=svm_xor(base,words);
 | 
			
		||||
      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    assert(ok);
 | 
			
		||||
  }
 | 
			
		||||
  FlightRecorder::CsumLog(csum);
 | 
			
		||||
#endif
 | 
			
		||||
  FlightRecorder::StepLog("rank inner product");
 | 
			
		||||
  ComplexD nrm = rankInnerProduct(left,right);
 | 
			
		||||
  //  ComplexD nrmck=nrm;
 | 
			
		||||
  RealD local = real(nrm);
 | 
			
		||||
  FlightRecorder::NormLog(real(nrm)); 
 | 
			
		||||
  ok = FlightRecorder::NormLog(real(nrm));
 | 
			
		||||
  if ( !ok ) {
 | 
			
		||||
    ComplexD nrm2 = rankInnerProduct(left,right);
 | 
			
		||||
    RealD local2 = real(nrm2);
 | 
			
		||||
    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
 | 
			
		||||
    assert(ok);
 | 
			
		||||
  }
 | 
			
		||||
  FlightRecorder::StepLog("Start global sum");
 | 
			
		||||
  //  grid->GlobalSumP2P(nrm);
 | 
			
		||||
  grid->GlobalSum(nrm);
 | 
			
		||||
  FlightRecorder::StepLog("Finished global sum");
 | 
			
		||||
  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
 | 
			
		||||
  FlightRecorder::ReductionLog(local,real(nrm)); 
 | 
			
		||||
  return nrm;
 | 
			
		||||
}
 | 
			
		||||
@@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 | 
			
		||||
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
 | 
			
		||||
      coalescedWrite(z_v[ss],tmp);
 | 
			
		||||
  });
 | 
			
		||||
  bool ok;
 | 
			
		||||
  uint64_t csum=0;
 | 
			
		||||
  uint64_t csum2=0;
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
 | 
			
		||||
  {
 | 
			
		||||
    // z_v
 | 
			
		||||
    {
 | 
			
		||||
      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
 | 
			
		||||
      uint64_t *base= (uint64_t *)&z_v[0];
 | 
			
		||||
      csum=svm_xor(base,words);
 | 
			
		||||
      ok = FlightRecorder::CsumLog(csum);
 | 
			
		||||
      if ( !ok ) {
 | 
			
		||||
	csum2=svm_xor(base,words);
 | 
			
		||||
	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      assert(ok);
 | 
			
		||||
    }
 | 
			
		||||
    // inner_v
 | 
			
		||||
    {
 | 
			
		||||
      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
 | 
			
		||||
      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
 | 
			
		||||
      csum=svm_xor(base,words);
 | 
			
		||||
      ok = FlightRecorder::CsumLog(csum);
 | 
			
		||||
      if ( !ok ) {
 | 
			
		||||
	csum2=svm_xor(base,words);
 | 
			
		||||
	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      assert(ok);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 | 
			
		||||
  ok = FlightRecorder::NormLog(real(nrm));
 | 
			
		||||
  assert(ok);
 | 
			
		||||
  RealD local = real(nrm);
 | 
			
		||||
  grid->GlobalSum(nrm);
 | 
			
		||||
  FlightRecorder::ReductionLog(local,real(nrm));
 | 
			
		||||
  return nrm; 
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
 
 | 
			
		||||
@@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
 | 
			
		||||
  Integer nsimd= vobj::Nsimd();
 | 
			
		||||
  { 
 | 
			
		||||
    sycl::buffer<sobj, 1> abuff(&ret, {1});
 | 
			
		||||
    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
 | 
			
		||||
      cgh.parallel_for(cl::sycl::range<1>{osites},
 | 
			
		||||
    theGridAccelerator->submit([&](sycl::handler &cgh) {
 | 
			
		||||
      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
 | 
			
		||||
      cgh.parallel_for(sycl::range<1>{osites},
 | 
			
		||||
                      Reduction,
 | 
			
		||||
                      [=] (cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
                      [=] (sycl::id<1> item, auto &sum) {
 | 
			
		||||
                        auto osite   = item[0];
 | 
			
		||||
                        sum +=Reduce(lat[osite]);
 | 
			
		||||
                      });
 | 
			
		||||
@@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
 | 
			
		||||
  Word ret = 0;
 | 
			
		||||
  { 
 | 
			
		||||
    sycl::buffer<Word, 1> abuff(&ret, {1});
 | 
			
		||||
    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
 | 
			
		||||
      cgh.parallel_for(cl::sycl::range<1>{L},
 | 
			
		||||
    theGridAccelerator->submit([&](sycl::handler &cgh) {
 | 
			
		||||
      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
 | 
			
		||||
      cgh.parallel_for(sycl::range<1>{L},
 | 
			
		||||
                      Reduction,
 | 
			
		||||
                      [=] (cl::sycl::id<1> index, auto &sum) {
 | 
			
		||||
                      [=] (sycl::id<1> index, auto &sum) {
 | 
			
		||||
                        sum ^=vec[index];
 | 
			
		||||
                      });
 | 
			
		||||
    });
 | 
			
		||||
 
 | 
			
		||||
@@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  for (int r = 0; r < rd; r++) {
 | 
			
		||||
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
			
		||||
          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
 | 
			
		||||
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 | 
			
		||||
      theGridAccelerator->submit([&](sycl::handler &cgh) {
 | 
			
		||||
          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
 | 
			
		||||
          cgh.parallel_for(sycl::range<1>{subvol_size},
 | 
			
		||||
          Reduction,
 | 
			
		||||
          [=](cl::sycl::id<1> item, auto &sum) {
 | 
			
		||||
          [=](sycl::id<1> item, auto &sum) {
 | 
			
		||||
              auto s = item[0];
 | 
			
		||||
              sum += rb_p[r*subvol_size+s];
 | 
			
		||||
          });
 | 
			
		||||
 
 | 
			
		||||
@@ -364,9 +364,10 @@ public:
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
			
		||||
  {
 | 
			
		||||
    FlightRecorder::StepLog("Communicate begin");
 | 
			
		||||
    // All GPU kernel tasks must complete
 | 
			
		||||
    accelerator_barrier();     // All kernels should ALREADY be complete
 | 
			
		||||
    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
 | 
			
		||||
    //    accelerator_barrier();     // All kernels should ALREADY be complete
 | 
			
		||||
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
 | 
			
		||||
                               // But the HaloGather had a barrier too.
 | 
			
		||||
    for(int i=0;i<Packets.size();i++){
 | 
			
		||||
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 | 
			
		||||
@@ -386,18 +387,20 @@ public:
 | 
			
		||||
 | 
			
		||||
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
			
		||||
  {
 | 
			
		||||
    FlightRecorder::StepLog("Start communicate complete");
 | 
			
		||||
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
 | 
			
		||||
    if   ( this->partialDirichlet ) DslashLogPartial();
 | 
			
		||||
    else if ( this->fullDirichlet ) DslashLogDirichlet();
 | 
			
		||||
    else DslashLogFull();
 | 
			
		||||
    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
 | 
			
		||||
    accelerator_barrier(); 
 | 
			
		||||
    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
 | 
			
		||||
    //    accelerator_barrier(); 
 | 
			
		||||
    _grid->StencilBarrier(); 
 | 
			
		||||
    // run any checksums
 | 
			
		||||
    for(int i=0;i<Packets.size();i++){
 | 
			
		||||
      if ( Packets[i].do_recv )
 | 
			
		||||
	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
 | 
			
		||||
    }
 | 
			
		||||
    FlightRecorder::StepLog("Finish communicate complete");
 | 
			
		||||
  }
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Blocking send and receive. Either sequential or parallel.
 | 
			
		||||
@@ -473,7 +476,7 @@ public:
 | 
			
		||||
  template<class compressor>
 | 
			
		||||
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
			
		||||
  {
 | 
			
		||||
    accelerator_barrier();
 | 
			
		||||
    //    accelerator_barrier();
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
 | 
			
		||||
    assert(source.Grid()==_grid);
 | 
			
		||||
@@ -487,7 +490,7 @@ public:
 | 
			
		||||
      HaloGatherDir(source,compress,point,face_idx);
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_barrier(); // All my local gathers are complete
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
    //    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
    face_table_computed=1;
 | 
			
		||||
    assert(u_comm_offset==_unified_buffer_size);
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -202,13 +202,13 @@ void acceleratorInit(void)
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_SYCL
 | 
			
		||||
 | 
			
		||||
cl::sycl::queue *theGridAccelerator;
 | 
			
		||||
cl::sycl::queue *theCopyAccelerator;
 | 
			
		||||
sycl::queue *theGridAccelerator;
 | 
			
		||||
sycl::queue *theCopyAccelerator;
 | 
			
		||||
void acceleratorInit(void)
 | 
			
		||||
{
 | 
			
		||||
  int nDevices = 1;
 | 
			
		||||
  //  cl::sycl::gpu_selector selector;
 | 
			
		||||
  //  cl::sycl::device selectedDevice { selector };
 | 
			
		||||
  //  sycl::gpu_selector selector;
 | 
			
		||||
  //  sycl::device selectedDevice { selector };
 | 
			
		||||
  theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
 | 
			
		||||
  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
 | 
			
		||||
  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 | 
			
		||||
@@ -242,14 +242,14 @@ void acceleratorInit(void)
 | 
			
		||||
  gethostname(hostname, HOST_NAME_MAX+1);
 | 
			
		||||
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
 | 
			
		||||
 | 
			
		||||
  auto devices = cl::sycl::device::get_devices();
 | 
			
		||||
  auto devices = sycl::device::get_devices();
 | 
			
		||||
  for(int d = 0;d<devices.size();d++){
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_STR(prop) \
 | 
			
		||||
    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
 | 
			
		||||
    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP_FMT(prop,FMT) \
 | 
			
		||||
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 | 
			
		||||
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
 | 
			
		||||
 | 
			
		||||
#define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
 | 
			
		||||
    if ( world_rank == 0) {
 | 
			
		||||
 
 | 
			
		||||
@@ -302,7 +302,7 @@ NAMESPACE_END(Grid);
 | 
			
		||||
 | 
			
		||||
// Force deterministic reductions
 | 
			
		||||
#define SYCL_REDUCTION_DETERMINISTIC
 | 
			
		||||
#include <sycl/CL/sycl.hpp>
 | 
			
		||||
#include <sycl/sycl.hpp>
 | 
			
		||||
#include <sycl/usm.hpp>
 | 
			
		||||
#include <level_zero/ze_api.h>
 | 
			
		||||
#include <sycl/ext/oneapi/backend/level_zero.hpp>
 | 
			
		||||
@@ -314,8 +314,8 @@ inline void acceleratorMem(void)
 | 
			
		||||
  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
extern cl::sycl::queue *theGridAccelerator;
 | 
			
		||||
extern cl::sycl::queue *theCopyAccelerator;
 | 
			
		||||
extern sycl::queue *theGridAccelerator;
 | 
			
		||||
extern sycl::queue *theCopyAccelerator;
 | 
			
		||||
 | 
			
		||||
#ifdef __SYCL_DEVICE_ONLY__
 | 
			
		||||
#define GRID_SIMT
 | 
			
		||||
@@ -326,24 +326,24 @@ extern cl::sycl::queue *theCopyAccelerator;
 | 
			
		||||
 | 
			
		||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 | 
			
		||||
#ifdef GRID_SIMT
 | 
			
		||||
 return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; 
 | 
			
		||||
 return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
 | 
			
		||||
#else
 | 
			
		||||
 return 0;
 | 
			
		||||
#endif
 | 
			
		||||
} // SYCL specific
 | 
			
		||||
 | 
			
		||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
 | 
			
		||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
 | 
			
		||||
  theGridAccelerator->submit([&](sycl::handler &cgh) {		\
 | 
			
		||||
    unsigned long nt=acceleratorThreads();				\
 | 
			
		||||
    if(nt < 8)nt=8;							\
 | 
			
		||||
    unsigned long unum1 = num1;						\
 | 
			
		||||
    unsigned long unum2 = num2;						\
 | 
			
		||||
    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
 | 
			
		||||
    cl::sycl::range<3> local {nt,1,nsimd};				\
 | 
			
		||||
    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
 | 
			
		||||
    sycl::range<3> local {nt,1,nsimd};				\
 | 
			
		||||
    sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
 | 
			
		||||
    cgh.parallel_for(							\
 | 
			
		||||
		     cl::sycl::nd_range<3>(global,local),		\
 | 
			
		||||
		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
 | 
			
		||||
		     sycl::nd_range<3>(global,local),			\
 | 
			
		||||
		     [=] (sycl::nd_item<3> item) /*mutable*/		\
 | 
			
		||||
		     [[intel::reqd_sub_group_size(16)]]			\
 | 
			
		||||
		     {							\
 | 
			
		||||
		       auto iter1    = item.get_global_id(0);		\
 | 
			
		||||
@@ -369,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele
 | 
			
		||||
inline int  acceleratorIsCommunicable(void *ptr)
 | 
			
		||||
{
 | 
			
		||||
#if 0
 | 
			
		||||
  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
 | 
			
		||||
  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
 | 
			
		||||
  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
 | 
			
		||||
  if ( uvm = sycl::usm::alloc::shared ) return 1;
 | 
			
		||||
  else return 0;
 | 
			
		||||
#endif
 | 
			
		||||
  return 1;
 | 
			
		||||
 
 | 
			
		||||
@@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
 | 
			
		||||
int FlightRecorder::LoggingMode;
 | 
			
		||||
int FlightRecorder::ChecksumComms;
 | 
			
		||||
int FlightRecorder::ChecksumCommsSend;
 | 
			
		||||
const char *   FlightRecorder::StepName;
 | 
			
		||||
int32_t  FlightRecorder::StepLoggingCounter;
 | 
			
		||||
int32_t  FlightRecorder::XmitLoggingCounter;
 | 
			
		||||
int32_t  FlightRecorder::RecvLoggingCounter;
 | 
			
		||||
int32_t  FlightRecorder::CsumLoggingCounter;
 | 
			
		||||
@@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
 | 
			
		||||
  CsumLoggingCounter=0;
 | 
			
		||||
  NormLoggingCounter=0;
 | 
			
		||||
  ReductionLoggingCounter=0;
 | 
			
		||||
  StepName = "No steps started";
 | 
			
		||||
  StepLoggingCounter=0;
 | 
			
		||||
}
 | 
			
		||||
void FlightRecorder::Truncate(void)
 | 
			
		||||
{
 | 
			
		||||
@@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
 | 
			
		||||
    assert(0);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
bool FlightRecorder::StepLog(const char *name)
 | 
			
		||||
{
 | 
			
		||||
  StepName = name;
 | 
			
		||||
  StepLoggingCounter ++;
 | 
			
		||||
  return true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void FlightRecorder::SetLoggingModePrint(void)
 | 
			
		||||
{
 | 
			
		||||
@@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
 | 
			
		||||
{
 | 
			
		||||
  return ErrorCounter;
 | 
			
		||||
}
 | 
			
		||||
void FlightRecorder::NormLog(double value)
 | 
			
		||||
bool FlightRecorder::NormLog(double value)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t hex = * ( (uint64_t *)&value );
 | 
			
		||||
  if(LoggingMode == LoggingModePrint) {
 | 
			
		||||
    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
			
		||||
    NormLoggingCounter++;
 | 
			
		||||
    return true;
 | 
			
		||||
  }
 | 
			
		||||
  if(LoggingMode == LoggingModeRecord) {
 | 
			
		||||
    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
			
		||||
    NormLogVector.push_back(value);
 | 
			
		||||
    NormLoggingCounter++;
 | 
			
		||||
    return true;
 | 
			
		||||
  }
 | 
			
		||||
  if(LoggingMode == LoggingModeVerify) {
 | 
			
		||||
 | 
			
		||||
@@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)
 | 
			
		||||
 | 
			
		||||
      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
 | 
			
		||||
 | 
			
		||||
	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
			
		||||
		FlightRecorder::StepLoggingCounter,
 | 
			
		||||
		FlightRecorder::StepName);
 | 
			
		||||
	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 | 
			
		||||
		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 | 
			
		||||
		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
 | 
			
		||||
@@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
 | 
			
		||||
		NormLoggingCounter,NormLogVector.size(),
 | 
			
		||||
		value, NormLogVector[NormLoggingCounter]); fflush(stderr);
 | 
			
		||||
 | 
			
		||||
	if(!ContinueOnFail)assert(0); // Force takedown of job
 | 
			
		||||
	BACKTRACEFP(stderr);
 | 
			
		||||
 | 
			
		||||
	if(!ContinueOnFail) return false;
 | 
			
		||||
	  
 | 
			
		||||
	ErrorCounter++;
 | 
			
		||||
      } else {
 | 
			
		||||
@@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
 | 
			
		||||
    }
 | 
			
		||||
    NormLoggingCounter++;
 | 
			
		||||
  }
 | 
			
		||||
  return true;
 | 
			
		||||
}
 | 
			
		||||
void FlightRecorder::CsumLog(uint64_t hex)
 | 
			
		||||
bool FlightRecorder::CsumLog(uint64_t hex)
 | 
			
		||||
{
 | 
			
		||||
  if(LoggingMode == LoggingModePrint) {
 | 
			
		||||
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
			
		||||
    CsumLoggingCounter++;
 | 
			
		||||
    return true;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if(LoggingMode == LoggingModeRecord) {
 | 
			
		||||
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
			
		||||
    CsumLogVector.push_back(hex);
 | 
			
		||||
    CsumLoggingCounter++;
 | 
			
		||||
    return true;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if(LoggingMode == LoggingModeVerify) {
 | 
			
		||||
@@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
 | 
			
		||||
 | 
			
		||||
      if ( hex != hexref ) {
 | 
			
		||||
 | 
			
		||||
	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
			
		||||
		FlightRecorder::StepLoggingCounter,
 | 
			
		||||
		FlightRecorder::StepName);
 | 
			
		||||
        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
 | 
			
		||||
		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
 | 
			
		||||
 | 
			
		||||
@@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
 | 
			
		||||
		GridHostname(),
 | 
			
		||||
		GlobalSharedMemory::WorldShmRank,
 | 
			
		||||
		CsumLoggingCounter,hex, hexref);
 | 
			
		||||
	BACKTRACEFP(stderr);
 | 
			
		||||
	fflush(stderr);
 | 
			
		||||
 | 
			
		||||
	if(!ContinueOnFail) assert(0); // Force takedown of job
 | 
			
		||||
	if(!ContinueOnFail) return false;
 | 
			
		||||
	  
 | 
			
		||||
	ErrorCounter++;
 | 
			
		||||
 | 
			
		||||
@@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
 | 
			
		||||
    }
 | 
			
		||||
    CsumLoggingCounter++;
 | 
			
		||||
  }
 | 
			
		||||
  return true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void FlightRecorder::ReductionLog(double local,double global)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t hex_l = * ( (uint64_t *)&local );
 | 
			
		||||
@@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
 | 
			
		||||
  if(LoggingMode == LoggingModeVerify) {
 | 
			
		||||
    if(ReductionLoggingCounter < ReductionLogVector.size()){
 | 
			
		||||
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
 | 
			
		||||
	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
			
		||||
		FlightRecorder::StepLoggingCounter,
 | 
			
		||||
		FlightRecorder::StepName);
 | 
			
		||||
	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
 | 
			
		||||
		GridHostname(),
 | 
			
		||||
		GlobalSharedMemory::WorldShmRank,
 | 
			
		||||
		ReductionLoggingCounter,ReductionLogVector.size(),
 | 
			
		||||
		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
 | 
			
		||||
	BACKTRACEFP(stderr);
 | 
			
		||||
	
 | 
			
		||||
	if ( !ContinueOnFail ) assert(0);
 | 
			
		||||
 | 
			
		||||
@@ -267,11 +297,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 | 
			
		||||
  if(LoggingMode == LoggingModeVerify) {
 | 
			
		||||
    if(XmitLoggingCounter < XmitLogVector.size()){
 | 
			
		||||
      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
 | 
			
		||||
	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
			
		||||
		FlightRecorder::StepLoggingCounter,
 | 
			
		||||
		FlightRecorder::StepName);
 | 
			
		||||
	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
 | 
			
		||||
		GridHostname(),
 | 
			
		||||
		GlobalSharedMemory::WorldShmRank,
 | 
			
		||||
		XmitLoggingCounter,XmitLogVector.size(),
 | 
			
		||||
		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
 | 
			
		||||
	BACKTRACEFP(stderr);
 | 
			
		||||
	
 | 
			
		||||
	if ( !ContinueOnFail ) assert(0);
 | 
			
		||||
 | 
			
		||||
@@ -309,11 +343,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
 | 
			
		||||
  if(LoggingMode == LoggingModeVerify) {
 | 
			
		||||
    if(RecvLoggingCounter < RecvLogVector.size()){
 | 
			
		||||
      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
 | 
			
		||||
	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
			
		||||
		FlightRecorder::StepLoggingCounter,
 | 
			
		||||
		FlightRecorder::StepName);
 | 
			
		||||
	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
 | 
			
		||||
		GridHostname(),
 | 
			
		||||
		GlobalSharedMemory::WorldShmRank,
 | 
			
		||||
		RecvLoggingCounter,RecvLogVector.size(),
 | 
			
		||||
		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
 | 
			
		||||
	BACKTRACEFP(stderr);
 | 
			
		||||
	
 | 
			
		||||
	if ( !ContinueOnFail ) assert(0);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -12,6 +12,8 @@ class FlightRecorder {
 | 
			
		||||
  
 | 
			
		||||
  static int                   LoggingMode;
 | 
			
		||||
  static uint64_t              ErrorCounter;
 | 
			
		||||
  static const char *                StepName;
 | 
			
		||||
  static int32_t               StepLoggingCounter;
 | 
			
		||||
  static int32_t               XmitLoggingCounter;
 | 
			
		||||
  static int32_t               RecvLoggingCounter;
 | 
			
		||||
  static int32_t               CsumLoggingCounter;
 | 
			
		||||
@@ -30,8 +32,9 @@ class FlightRecorder {
 | 
			
		||||
  static void SetLoggingModeRecord(void);
 | 
			
		||||
  static void SetLoggingModeVerify(void);
 | 
			
		||||
  static void SetLoggingMode(LoggingMode_t mode);
 | 
			
		||||
  static void NormLog(double value);
 | 
			
		||||
  static void CsumLog(uint64_t csum);
 | 
			
		||||
  static bool StepLog(const char *name);
 | 
			
		||||
  static bool NormLog(double value);
 | 
			
		||||
  static bool CsumLog(uint64_t csum);
 | 
			
		||||
  static void ReductionLog(double lcl, double glbl);
 | 
			
		||||
  static void Truncate(void);
 | 
			
		||||
  static void ResetCounters(void);
 | 
			
		||||
 
 | 
			
		||||
@@ -552,6 +552,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
			
		||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
{
 | 
			
		||||
  fprintf(stderr,"Signal handler on host %s\n",hostname);
 | 
			
		||||
  fprintf(stderr,"FlightRecorder step %d stage %s \n",
 | 
			
		||||
	  FlightRecorder::StepLoggingCounter,
 | 
			
		||||
	  FlightRecorder::StepName);
 | 
			
		||||
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
 | 
			
		||||
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
			
		||||
  fprintf(stderr,"         code %d\n",si->si_code);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										14
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								configure.ac
									
									
									
									
									
								
							@@ -128,6 +128,20 @@ case ${ac_LAPACK} in
 | 
			
		||||
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
############### internal reduction
 | 
			
		||||
AC_ARG_ENABLE([reduction],
 | 
			
		||||
    [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
 | 
			
		||||
    [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
 | 
			
		||||
 | 
			
		||||
case ${ac_REDUCTION} in
 | 
			
		||||
    mpi)
 | 
			
		||||
        ;;
 | 
			
		||||
    grid)
 | 
			
		||||
        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
 | 
			
		||||
    *)
 | 
			
		||||
        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
 | 
			
		||||
esac
 | 
			
		||||
 | 
			
		||||
############### tracing
 | 
			
		||||
AC_ARG_ENABLE([tracing],
 | 
			
		||||
    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,14 @@
 | 
			
		||||
#Ahead of time compile for PVC
 | 
			
		||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
			
		||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
 | 
			
		||||
 | 
			
		||||
#JIT compile 
 | 
			
		||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
			
		||||
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
 | 
			
		||||
 | 
			
		||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
			
		||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions "
 | 
			
		||||
../../configure \
 | 
			
		||||
	--enable-simd=GPU \
 | 
			
		||||
	--enable-reduction=grid \
 | 
			
		||||
	--enable-gen-simd-width=64 \
 | 
			
		||||
	--enable-comms=mpi-auto \
 | 
			
		||||
	--enable-debug \
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,8 @@
 | 
			
		||||
module load oneapi/release/2023.12.15.001
 | 
			
		||||
#module load oneapi/release/2023.12.15.001
 | 
			
		||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
 | 
			
		||||
#module load mpich-config/mode/deterministic
 | 
			
		||||
#module load intel_compute_runtime/release/821.35
 | 
			
		||||
 | 
			
		||||
source ~/spack/share/spack/setup-env.sh 
 | 
			
		||||
spack load c-lime
 | 
			
		||||
spack load openssl
 | 
			
		||||
 
 | 
			
		||||
@@ -124,6 +124,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
			
		||||
  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
			
		||||
  //  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
			
		||||
  //  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
			
		||||
 | 
			
		||||
  int nsecs=600;
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
 | 
			
		||||
@@ -131,6 +133,10 @@ int main (int argc, char ** argv)
 | 
			
		||||
    GridCmdOptionInt(arg,nsecs);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
 | 
			
		||||
  UGrid->Barrier();
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
 | 
			
		||||
 | 
			
		||||
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
 | 
			
		||||
@@ -148,7 +154,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  FlightRecorder::ContinueOnFail = 0;
 | 
			
		||||
  FlightRecorder::PrintEntireLog = 0;
 | 
			
		||||
  FlightRecorder::ChecksumComms  = 1;
 | 
			
		||||
  FlightRecorder::ChecksumComms  = 0;
 | 
			
		||||
  FlightRecorder::ChecksumCommsSend=0;
 | 
			
		||||
 | 
			
		||||
  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user