mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Compare commits
	
		
			17 Commits
		
	
	
		
			9fa8bd6438
			...
			a78a61d76f
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| a78a61d76f | |||
| 2eff3f34ed | |||
| 03687c1d62 | |||
| febfe4e77f | |||
| 4d1aa134b5 | |||
| 5ec879860a | |||
| b728af903c | |||
| 54f1999030 | |||
| fd58f0b669 | |||
| c5c67b706e | |||
| be7a543e2c | |||
| 68f112d576 | |||
| ec1395a304 | |||
| beb0e474ee | |||
| 2b5fdcbbc5 | |||
| 295127d456 | |||
| 7dcfb13694 | 
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 | 
				
			|||||||
  typedef cublasHandle_t gridblasHandle_t;
 | 
					  typedef cublasHandle_t gridblasHandle_t;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#ifdef GRID_SYCL
 | 
					#ifdef GRID_SYCL
 | 
				
			||||||
  typedef cl::sycl::queue *gridblasHandle_t;
 | 
					  typedef sycl::queue *gridblasHandle_t;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#ifdef GRID_ONE_MKL
 | 
					#ifdef GRID_ONE_MKL
 | 
				
			||||||
  typedef cl::sycl::queue *gridblasHandle_t;
 | 
					  typedef sycl::queue *gridblasHandle_t;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
 | 
					#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
 | 
				
			||||||
  typedef int32_t gridblasHandle_t;
 | 
					  typedef int32_t gridblasHandle_t;
 | 
				
			||||||
@@ -89,9 +89,9 @@ public:
 | 
				
			|||||||
      gridblasHandle = theGridAccelerator;
 | 
					      gridblasHandle = theGridAccelerator;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#ifdef GRID_ONE_MKL
 | 
					#ifdef GRID_ONE_MKL
 | 
				
			||||||
      cl::sycl::gpu_selector selector;
 | 
					      sycl::gpu_selector selector;
 | 
				
			||||||
      cl::sycl::device selectedDevice { selector };
 | 
					      sycl::device selectedDevice { selector };
 | 
				
			||||||
      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
 | 
					      sycl::property_list q_prop{sycl::property::queue::in_order()};
 | 
				
			||||||
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 | 
					      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
      gridblasInit=1;
 | 
					      gridblasInit=1;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
 | 
				
			|||||||
      //Compute double precision rsd and also new RHS vector.
 | 
					      //Compute double precision rsd and also new RHS vector.
 | 
				
			||||||
      Linop_d.HermOp(sol_d, tmp_d);
 | 
					      Linop_d.HermOp(sol_d, tmp_d);
 | 
				
			||||||
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
 | 
					      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
 | 
				
			||||||
      
 | 
					      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
 | 
				
			||||||
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
 | 
					      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      if(norm < OuterLoopNormMult * stop){
 | 
					      if(norm < OuterLoopNormMult * stop){
 | 
				
			||||||
	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 | 
						std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 | 
				
			||||||
	break;
 | 
						break;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 | 
					      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      PrecChangeTimer.Start();
 | 
					      PrecChangeTimer.Start();
 | 
				
			||||||
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
 | 
					      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 | 
				
			|||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
 | 
					// very VERY rarely (Log, serial RNG) we need world without a grid
 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef USE_GRID_REDUCTION
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(ComplexF &c)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  GlobalSumP2P(c);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(ComplexD &c)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  GlobalSumP2P(c);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
 | 
					void CartesianCommunicator::GlobalSum(ComplexF &c)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  GlobalSumVector((float *)&c,2);
 | 
					  GlobalSumVector((float *)&c,2);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  GlobalSumVector((float *)c,2*N);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
 | 
					void CartesianCommunicator::GlobalSum(ComplexD &c)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  GlobalSumVector((double *)&c,2);
 | 
					  GlobalSumVector((double *)&c,2);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  GlobalSumVector((float *)c,2*N);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 | 
					void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  GlobalSumVector((double *)c,2*N);
 | 
					  GlobalSumVector((double *)c,2*N);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -128,6 +128,34 @@ public:
 | 
				
			|||||||
  void GlobalXOR(uint32_t &);
 | 
					  void GlobalXOR(uint32_t &);
 | 
				
			||||||
  void GlobalXOR(uint64_t &);
 | 
					  void GlobalXOR(uint64_t &);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					  template<class obj> void GlobalSumP2P(obj &o)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    std::vector<obj> column;
 | 
				
			||||||
 | 
					    obj accum = o;
 | 
				
			||||||
 | 
					    int source,dest;
 | 
				
			||||||
 | 
					    for(int d=0;d<_ndimension;d++){
 | 
				
			||||||
 | 
					      column.resize(_processors[d]);
 | 
				
			||||||
 | 
					      column[0] = accum;
 | 
				
			||||||
 | 
					      std::vector<CommsRequest_t> list;
 | 
				
			||||||
 | 
					      for(int p=1;p<_processors[d];p++){
 | 
				
			||||||
 | 
						ShiftedRanks(d,p,source,dest);
 | 
				
			||||||
 | 
						SendToRecvFromBegin(list,
 | 
				
			||||||
 | 
								    &column[0],
 | 
				
			||||||
 | 
								    dest,
 | 
				
			||||||
 | 
								    &column[p],
 | 
				
			||||||
 | 
								    source,
 | 
				
			||||||
 | 
								    sizeof(obj),d*100+p);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      CommsComplete(list);
 | 
				
			||||||
 | 
					      for(int p=1;p<_processors[d];p++){
 | 
				
			||||||
 | 
						accum = accum + column[p];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    Broadcast(0,accum);
 | 
				
			||||||
 | 
					    o=accum;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class obj> void GlobalSum(obj &o){
 | 
					  template<class obj> void GlobalSum(obj &o){
 | 
				
			||||||
    typedef typename obj::scalar_type scalar_type;
 | 
					    typedef typename obj::scalar_type scalar_type;
 | 
				
			||||||
    int words = sizeof(obj)/sizeof(scalar_type);
 | 
					    int words = sizeof(obj)/sizeof(scalar_type);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator()
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					#ifdef USE_GRID_REDUCTION
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(float &f){
 | 
				
			||||||
 | 
					  CartesianCommunicator::GlobalSumP2P(f);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(double &d)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  CartesianCommunicator::GlobalSumP2P(d);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(float &f){
 | 
				
			||||||
 | 
					  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
				
			||||||
 | 
					  assert(ierr==0);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					void CartesianCommunicator::GlobalSum(double &d)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
				
			||||||
 | 
					  assert(ierr==0);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
					void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
				
			||||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
					  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
				
			||||||
  assert(ierr==0);
 | 
					  assert(ierr==0);
 | 
				
			||||||
@@ -287,20 +306,11 @@ void CartesianCommunicator::GlobalMax(double &d)
 | 
				
			|||||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
 | 
					  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
 | 
				
			||||||
  assert(ierr==0);
 | 
					  assert(ierr==0);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
					 | 
				
			||||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
					 | 
				
			||||||
  assert(ierr==0);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
					void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 | 
					  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 | 
				
			||||||
  assert(ierr==0);
 | 
					  assert(ierr==0);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
					 | 
				
			||||||
  assert(ierr==0);
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
					void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 | 
					  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -569,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 | 
				
			|||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
					#ifdef GRID_SYCL_LEVEL_ZERO_IPC
 | 
				
			||||||
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
 | 
					    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
 | 
					    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
 | 
				
			||||||
    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
 | 
					    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
    ze_ipc_mem_handle_t ihandle;
 | 
					    ze_ipc_mem_handle_t ihandle;
 | 
				
			||||||
    clone_mem_t handle;
 | 
					    clone_mem_t handle;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
 | 
				
			|||||||
  });
 | 
					  });
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define FAST_AXPY_NORM
 | 
				
			||||||
template<class sobj,class vobj> inline
 | 
					template<class sobj,class vobj> inline
 | 
				
			||||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 | 
					RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  GRID_TRACE("axpy_norm");
 | 
					  GRID_TRACE("axpy_norm");
 | 
				
			||||||
    return axpy_norm_fast(ret,a,x,y);
 | 
					#ifdef FAST_AXPY_NORM
 | 
				
			||||||
 | 
					  return axpy_norm_fast(ret,a,x,y);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					  ret = a*x+y;
 | 
				
			||||||
 | 
					  RealD nn=norm2(ret);
 | 
				
			||||||
 | 
					  return nn;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
template<class sobj,class vobj> inline
 | 
					template<class sobj,class vobj> inline
 | 
				
			||||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 | 
					RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  GRID_TRACE("axpby_norm");
 | 
					  GRID_TRACE("axpby_norm");
 | 
				
			||||||
    return axpby_norm_fast(ret,a,b,x,y);
 | 
					#ifdef FAST_AXPY_NORM
 | 
				
			||||||
 | 
					  return axpby_norm_fast(ret,a,b,x,y);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					  ret = a*x+b*y;
 | 
				
			||||||
 | 
					  RealD nn=norm2(ret);
 | 
				
			||||||
 | 
					  return nn;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/// Trace product
 | 
					/// Trace product
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -290,8 +290,10 @@ template<class vobj>
 | 
				
			|||||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
 | 
					inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
 | 
				
			||||||
  GridBase *grid = left.Grid();
 | 
					  GridBase *grid = left.Grid();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  bool ok;
 | 
				
			||||||
#ifdef GRID_SYCL
 | 
					#ifdef GRID_SYCL
 | 
				
			||||||
  uint64_t csum=0;
 | 
					  uint64_t csum=0;
 | 
				
			||||||
 | 
					  uint64_t csum2=0;
 | 
				
			||||||
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
 | 
					  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    // Hack
 | 
					    // Hack
 | 
				
			||||||
@@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 | 
				
			|||||||
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
 | 
					    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
 | 
				
			||||||
    uint64_t *base= (uint64_t *)&l_v[0];
 | 
					    uint64_t *base= (uint64_t *)&l_v[0];
 | 
				
			||||||
    csum=svm_xor(base,words);
 | 
					    csum=svm_xor(base,words);
 | 
				
			||||||
 | 
					    ok = FlightRecorder::CsumLog(csum);
 | 
				
			||||||
 | 
					    if ( !ok ) {
 | 
				
			||||||
 | 
					      csum2=svm_xor(base,words);
 | 
				
			||||||
 | 
					      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					      //      csum2=svm_xor(base,words);
 | 
				
			||||||
 | 
					      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    assert(ok);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  FlightRecorder::CsumLog(csum);
 | 
					 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					  FlightRecorder::StepLog("rank inner product");
 | 
				
			||||||
  ComplexD nrm = rankInnerProduct(left,right);
 | 
					  ComplexD nrm = rankInnerProduct(left,right);
 | 
				
			||||||
 | 
					  //  ComplexD nrmck=nrm;
 | 
				
			||||||
  RealD local = real(nrm);
 | 
					  RealD local = real(nrm);
 | 
				
			||||||
  FlightRecorder::NormLog(real(nrm)); 
 | 
					  ok = FlightRecorder::NormLog(real(nrm));
 | 
				
			||||||
 | 
					  if ( !ok ) {
 | 
				
			||||||
 | 
					    ComplexD nrm2 = rankInnerProduct(left,right);
 | 
				
			||||||
 | 
					    RealD local2 = real(nrm2);
 | 
				
			||||||
 | 
					    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
 | 
				
			||||||
 | 
					    assert(ok);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  FlightRecorder::StepLog("Start global sum");
 | 
				
			||||||
 | 
					  //  grid->GlobalSumP2P(nrm);
 | 
				
			||||||
  grid->GlobalSum(nrm);
 | 
					  grid->GlobalSum(nrm);
 | 
				
			||||||
 | 
					  FlightRecorder::StepLog("Finished global sum");
 | 
				
			||||||
 | 
					  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
 | 
				
			||||||
  FlightRecorder::ReductionLog(local,real(nrm)); 
 | 
					  FlightRecorder::ReductionLog(local,real(nrm)); 
 | 
				
			||||||
  return nrm;
 | 
					  return nrm;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 | 
				
			|||||||
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
 | 
					      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
 | 
				
			||||||
      coalescedWrite(z_v[ss],tmp);
 | 
					      coalescedWrite(z_v[ss],tmp);
 | 
				
			||||||
  });
 | 
					  });
 | 
				
			||||||
 | 
					  bool ok;
 | 
				
			||||||
 | 
					  uint64_t csum=0;
 | 
				
			||||||
 | 
					  uint64_t csum2=0;
 | 
				
			||||||
 | 
					#ifdef GRID_SYCL
 | 
				
			||||||
 | 
					  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    // z_v
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
 | 
				
			||||||
 | 
					      uint64_t *base= (uint64_t *)&z_v[0];
 | 
				
			||||||
 | 
					      csum=svm_xor(base,words);
 | 
				
			||||||
 | 
					      ok = FlightRecorder::CsumLog(csum);
 | 
				
			||||||
 | 
					      if ( !ok ) {
 | 
				
			||||||
 | 
						csum2=svm_xor(base,words);
 | 
				
			||||||
 | 
						std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      assert(ok);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // inner_v
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
 | 
				
			||||||
 | 
					      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
 | 
				
			||||||
 | 
					      csum=svm_xor(base,words);
 | 
				
			||||||
 | 
					      ok = FlightRecorder::CsumLog(csum);
 | 
				
			||||||
 | 
					      if ( !ok ) {
 | 
				
			||||||
 | 
						csum2=svm_xor(base,words);
 | 
				
			||||||
 | 
						std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      assert(ok);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 | 
					  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
 | 
				
			||||||
 | 
					  ok = FlightRecorder::NormLog(real(nrm));
 | 
				
			||||||
 | 
					  assert(ok);
 | 
				
			||||||
 | 
					  RealD local = real(nrm);
 | 
				
			||||||
  grid->GlobalSum(nrm);
 | 
					  grid->GlobalSum(nrm);
 | 
				
			||||||
 | 
					  FlightRecorder::ReductionLog(local,real(nrm));
 | 
				
			||||||
  return nrm; 
 | 
					  return nrm; 
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
 | 
				
			|||||||
  Integer nsimd= vobj::Nsimd();
 | 
					  Integer nsimd= vobj::Nsimd();
 | 
				
			||||||
  { 
 | 
					  { 
 | 
				
			||||||
    sycl::buffer<sobj, 1> abuff(&ret, {1});
 | 
					    sycl::buffer<sobj, 1> abuff(&ret, {1});
 | 
				
			||||||
    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
					    theGridAccelerator->submit([&](sycl::handler &cgh) {
 | 
				
			||||||
      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
 | 
					      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
 | 
				
			||||||
      cgh.parallel_for(cl::sycl::range<1>{osites},
 | 
					      cgh.parallel_for(sycl::range<1>{osites},
 | 
				
			||||||
                      Reduction,
 | 
					                      Reduction,
 | 
				
			||||||
                      [=] (cl::sycl::id<1> item, auto &sum) {
 | 
					                      [=] (sycl::id<1> item, auto &sum) {
 | 
				
			||||||
                        auto osite   = item[0];
 | 
					                        auto osite   = item[0];
 | 
				
			||||||
                        sum +=Reduce(lat[osite]);
 | 
					                        sum +=Reduce(lat[osite]);
 | 
				
			||||||
                      });
 | 
					                      });
 | 
				
			||||||
@@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
 | 
				
			|||||||
  Word ret = 0;
 | 
					  Word ret = 0;
 | 
				
			||||||
  { 
 | 
					  { 
 | 
				
			||||||
    sycl::buffer<Word, 1> abuff(&ret, {1});
 | 
					    sycl::buffer<Word, 1> abuff(&ret, {1});
 | 
				
			||||||
    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
					    theGridAccelerator->submit([&](sycl::handler &cgh) {
 | 
				
			||||||
      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
 | 
					      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
 | 
				
			||||||
      cgh.parallel_for(cl::sycl::range<1>{L},
 | 
					      cgh.parallel_for(sycl::range<1>{L},
 | 
				
			||||||
                      Reduction,
 | 
					                      Reduction,
 | 
				
			||||||
                      [=] (cl::sycl::id<1> index, auto &sum) {
 | 
					                      [=] (sycl::id<1> index, auto &sum) {
 | 
				
			||||||
                        sum ^=vec[index];
 | 
					                        sum ^=vec[index];
 | 
				
			||||||
                      });
 | 
					                      });
 | 
				
			||||||
    });
 | 
					    });
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
 | 
				
			|||||||
  });
 | 
					  });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  for (int r = 0; r < rd; r++) {
 | 
					  for (int r = 0; r < rd; r++) {
 | 
				
			||||||
      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
 | 
					      theGridAccelerator->submit([&](sycl::handler &cgh) {
 | 
				
			||||||
          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
 | 
					          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
 | 
				
			||||||
          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 | 
					          cgh.parallel_for(sycl::range<1>{subvol_size},
 | 
				
			||||||
          Reduction,
 | 
					          Reduction,
 | 
				
			||||||
          [=](cl::sycl::id<1> item, auto &sum) {
 | 
					          [=](sycl::id<1> item, auto &sum) {
 | 
				
			||||||
              auto s = item[0];
 | 
					              auto s = item[0];
 | 
				
			||||||
              sum += rb_p[r*subvol_size+s];
 | 
					              sum += rb_p[r*subvol_size+s];
 | 
				
			||||||
          });
 | 
					          });
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -364,9 +364,10 @@ public:
 | 
				
			|||||||
  ////////////////////////////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
					  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
 | 
					    FlightRecorder::StepLog("Communicate begin");
 | 
				
			||||||
    // All GPU kernel tasks must complete
 | 
					    // All GPU kernel tasks must complete
 | 
				
			||||||
    accelerator_barrier();     // All kernels should ALREADY be complete
 | 
					    //    accelerator_barrier();     // All kernels should ALREADY be complete
 | 
				
			||||||
    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
 | 
					    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
 | 
				
			||||||
                               // But the HaloGather had a barrier too.
 | 
					                               // But the HaloGather had a barrier too.
 | 
				
			||||||
    for(int i=0;i<Packets.size();i++){
 | 
					    for(int i=0;i<Packets.size();i++){
 | 
				
			||||||
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 | 
					      _grid->StencilSendToRecvFromBegin(MpiReqs,
 | 
				
			||||||
@@ -386,18 +387,20 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
					  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
 | 
					    FlightRecorder::StepLog("Start communicate complete");
 | 
				
			||||||
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
 | 
					    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
 | 
				
			||||||
    if   ( this->partialDirichlet ) DslashLogPartial();
 | 
					    if   ( this->partialDirichlet ) DslashLogPartial();
 | 
				
			||||||
    else if ( this->fullDirichlet ) DslashLogDirichlet();
 | 
					    else if ( this->fullDirichlet ) DslashLogDirichlet();
 | 
				
			||||||
    else DslashLogFull();
 | 
					    else DslashLogFull();
 | 
				
			||||||
    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
 | 
					    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
 | 
				
			||||||
    accelerator_barrier(); 
 | 
					    //    accelerator_barrier(); 
 | 
				
			||||||
    _grid->StencilBarrier(); 
 | 
					    _grid->StencilBarrier(); 
 | 
				
			||||||
    // run any checksums
 | 
					    // run any checksums
 | 
				
			||||||
    for(int i=0;i<Packets.size();i++){
 | 
					    for(int i=0;i<Packets.size();i++){
 | 
				
			||||||
      if ( Packets[i].do_recv )
 | 
					      if ( Packets[i].do_recv )
 | 
				
			||||||
	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
 | 
						FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    FlightRecorder::StepLog("Finish communicate complete");
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  ////////////////////////////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
  // Blocking send and receive. Either sequential or parallel.
 | 
					  // Blocking send and receive. Either sequential or parallel.
 | 
				
			||||||
@@ -473,7 +476,7 @@ public:
 | 
				
			|||||||
  template<class compressor>
 | 
					  template<class compressor>
 | 
				
			||||||
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
					  void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    accelerator_barrier();
 | 
					    //    accelerator_barrier();
 | 
				
			||||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
					    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert(source.Grid()==_grid);
 | 
					    assert(source.Grid()==_grid);
 | 
				
			||||||
@@ -487,7 +490,7 @@ public:
 | 
				
			|||||||
      HaloGatherDir(source,compress,point,face_idx);
 | 
					      HaloGatherDir(source,compress,point,face_idx);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    accelerator_barrier(); // All my local gathers are complete
 | 
					    accelerator_barrier(); // All my local gathers are complete
 | 
				
			||||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
					    //    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
				
			||||||
    face_table_computed=1;
 | 
					    face_table_computed=1;
 | 
				
			||||||
    assert(u_comm_offset==_unified_buffer_size);
 | 
					    assert(u_comm_offset==_unified_buffer_size);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -202,13 +202,13 @@ void acceleratorInit(void)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#ifdef GRID_SYCL
 | 
					#ifdef GRID_SYCL
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cl::sycl::queue *theGridAccelerator;
 | 
					sycl::queue *theGridAccelerator;
 | 
				
			||||||
cl::sycl::queue *theCopyAccelerator;
 | 
					sycl::queue *theCopyAccelerator;
 | 
				
			||||||
void acceleratorInit(void)
 | 
					void acceleratorInit(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  int nDevices = 1;
 | 
					  int nDevices = 1;
 | 
				
			||||||
  //  cl::sycl::gpu_selector selector;
 | 
					  //  sycl::gpu_selector selector;
 | 
				
			||||||
  //  cl::sycl::device selectedDevice { selector };
 | 
					  //  sycl::device selectedDevice { selector };
 | 
				
			||||||
  theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
 | 
					  theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
 | 
				
			||||||
  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
 | 
					  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
 | 
				
			||||||
  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 | 
					  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
 | 
				
			||||||
@@ -242,14 +242,14 @@ void acceleratorInit(void)
 | 
				
			|||||||
  gethostname(hostname, HOST_NAME_MAX+1);
 | 
					  gethostname(hostname, HOST_NAME_MAX+1);
 | 
				
			||||||
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
 | 
					  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  auto devices = cl::sycl::device::get_devices();
 | 
					  auto devices = sycl::device::get_devices();
 | 
				
			||||||
  for(int d = 0;d<devices.size();d++){
 | 
					  for(int d = 0;d<devices.size();d++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define GPU_PROP_STR(prop) \
 | 
					#define GPU_PROP_STR(prop) \
 | 
				
			||||||
    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
 | 
					    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define GPU_PROP_FMT(prop,FMT) \
 | 
					#define GPU_PROP_FMT(prop,FMT) \
 | 
				
			||||||
    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
 | 
					    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
 | 
					#define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
 | 
				
			||||||
    if ( world_rank == 0) {
 | 
					    if ( world_rank == 0) {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -302,7 +302,7 @@ NAMESPACE_END(Grid);
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
// Force deterministic reductions
 | 
					// Force deterministic reductions
 | 
				
			||||||
#define SYCL_REDUCTION_DETERMINISTIC
 | 
					#define SYCL_REDUCTION_DETERMINISTIC
 | 
				
			||||||
#include <sycl/CL/sycl.hpp>
 | 
					#include <sycl/sycl.hpp>
 | 
				
			||||||
#include <sycl/usm.hpp>
 | 
					#include <sycl/usm.hpp>
 | 
				
			||||||
#include <level_zero/ze_api.h>
 | 
					#include <level_zero/ze_api.h>
 | 
				
			||||||
#include <sycl/ext/oneapi/backend/level_zero.hpp>
 | 
					#include <sycl/ext/oneapi/backend/level_zero.hpp>
 | 
				
			||||||
@@ -314,8 +314,8 @@ inline void acceleratorMem(void)
 | 
				
			|||||||
  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
 | 
					  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern cl::sycl::queue *theGridAccelerator;
 | 
					extern sycl::queue *theGridAccelerator;
 | 
				
			||||||
extern cl::sycl::queue *theCopyAccelerator;
 | 
					extern sycl::queue *theCopyAccelerator;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef __SYCL_DEVICE_ONLY__
 | 
					#ifdef __SYCL_DEVICE_ONLY__
 | 
				
			||||||
#define GRID_SIMT
 | 
					#define GRID_SIMT
 | 
				
			||||||
@@ -326,24 +326,24 @@ extern cl::sycl::queue *theCopyAccelerator;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 | 
					accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 | 
				
			||||||
#ifdef GRID_SIMT
 | 
					#ifdef GRID_SIMT
 | 
				
			||||||
 return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; 
 | 
					 return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
 return 0;
 | 
					 return 0;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
} // SYCL specific
 | 
					} // SYCL specific
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
 | 
					#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
 | 
				
			||||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
 | 
					  theGridAccelerator->submit([&](sycl::handler &cgh) {		\
 | 
				
			||||||
    unsigned long nt=acceleratorThreads();				\
 | 
					    unsigned long nt=acceleratorThreads();				\
 | 
				
			||||||
    if(nt < 8)nt=8;							\
 | 
					    if(nt < 8)nt=8;							\
 | 
				
			||||||
    unsigned long unum1 = num1;						\
 | 
					    unsigned long unum1 = num1;						\
 | 
				
			||||||
    unsigned long unum2 = num2;						\
 | 
					    unsigned long unum2 = num2;						\
 | 
				
			||||||
    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
 | 
					    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
 | 
				
			||||||
    cl::sycl::range<3> local {nt,1,nsimd};				\
 | 
					    sycl::range<3> local {nt,1,nsimd};				\
 | 
				
			||||||
    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
 | 
					    sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
 | 
				
			||||||
    cgh.parallel_for(							\
 | 
					    cgh.parallel_for(							\
 | 
				
			||||||
		     cl::sycl::nd_range<3>(global,local),		\
 | 
							     sycl::nd_range<3>(global,local),			\
 | 
				
			||||||
		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
 | 
							     [=] (sycl::nd_item<3> item) /*mutable*/		\
 | 
				
			||||||
		     [[intel::reqd_sub_group_size(16)]]			\
 | 
							     [[intel::reqd_sub_group_size(16)]]			\
 | 
				
			||||||
		     {							\
 | 
							     {							\
 | 
				
			||||||
		       auto iter1    = item.get_global_id(0);		\
 | 
							       auto iter1    = item.get_global_id(0);		\
 | 
				
			||||||
@@ -369,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele
 | 
				
			|||||||
inline int  acceleratorIsCommunicable(void *ptr)
 | 
					inline int  acceleratorIsCommunicable(void *ptr)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
#if 0
 | 
					#if 0
 | 
				
			||||||
  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
 | 
					  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
 | 
				
			||||||
  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
 | 
					  if ( uvm = sycl::usm::alloc::shared ) return 1;
 | 
				
			||||||
  else return 0;
 | 
					  else return 0;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  return 1;
 | 
					  return 1;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
 | 
				
			|||||||
int FlightRecorder::LoggingMode;
 | 
					int FlightRecorder::LoggingMode;
 | 
				
			||||||
int FlightRecorder::ChecksumComms;
 | 
					int FlightRecorder::ChecksumComms;
 | 
				
			||||||
int FlightRecorder::ChecksumCommsSend;
 | 
					int FlightRecorder::ChecksumCommsSend;
 | 
				
			||||||
 | 
					const char *   FlightRecorder::StepName;
 | 
				
			||||||
 | 
					int32_t  FlightRecorder::StepLoggingCounter;
 | 
				
			||||||
int32_t  FlightRecorder::XmitLoggingCounter;
 | 
					int32_t  FlightRecorder::XmitLoggingCounter;
 | 
				
			||||||
int32_t  FlightRecorder::RecvLoggingCounter;
 | 
					int32_t  FlightRecorder::RecvLoggingCounter;
 | 
				
			||||||
int32_t  FlightRecorder::CsumLoggingCounter;
 | 
					int32_t  FlightRecorder::CsumLoggingCounter;
 | 
				
			||||||
@@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
 | 
				
			|||||||
  CsumLoggingCounter=0;
 | 
					  CsumLoggingCounter=0;
 | 
				
			||||||
  NormLoggingCounter=0;
 | 
					  NormLoggingCounter=0;
 | 
				
			||||||
  ReductionLoggingCounter=0;
 | 
					  ReductionLoggingCounter=0;
 | 
				
			||||||
 | 
					  StepName = "No steps started";
 | 
				
			||||||
 | 
					  StepLoggingCounter=0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void FlightRecorder::Truncate(void)
 | 
					void FlightRecorder::Truncate(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
 | 
				
			|||||||
    assert(0);
 | 
					    assert(0);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					bool FlightRecorder::StepLog(const char *name)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  StepName = name;
 | 
				
			||||||
 | 
					  StepLoggingCounter ++;
 | 
				
			||||||
 | 
					  return true;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void FlightRecorder::SetLoggingModePrint(void)
 | 
					void FlightRecorder::SetLoggingModePrint(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  return ErrorCounter;
 | 
					  return ErrorCounter;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void FlightRecorder::NormLog(double value)
 | 
					bool FlightRecorder::NormLog(double value)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  uint64_t hex = * ( (uint64_t *)&value );
 | 
					  uint64_t hex = * ( (uint64_t *)&value );
 | 
				
			||||||
  if(LoggingMode == LoggingModePrint) {
 | 
					  if(LoggingMode == LoggingModePrint) {
 | 
				
			||||||
    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
					    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
				
			||||||
    NormLoggingCounter++;
 | 
					    NormLoggingCounter++;
 | 
				
			||||||
 | 
					    return true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  if(LoggingMode == LoggingModeRecord) {
 | 
					  if(LoggingMode == LoggingModeRecord) {
 | 
				
			||||||
    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
					    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
				
			||||||
    NormLogVector.push_back(value);
 | 
					    NormLogVector.push_back(value);
 | 
				
			||||||
    NormLoggingCounter++;
 | 
					    NormLoggingCounter++;
 | 
				
			||||||
 | 
					    return true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  if(LoggingMode == LoggingModeVerify) {
 | 
					  if(LoggingMode == LoggingModeVerify) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
 | 
					      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
				
			||||||
 | 
							FlightRecorder::StepLoggingCounter,
 | 
				
			||||||
 | 
							FlightRecorder::StepName);
 | 
				
			||||||
	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 | 
						std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 | 
				
			||||||
		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 | 
							 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 | 
				
			||||||
		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
 | 
							 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
 | 
				
			||||||
@@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
 | 
				
			|||||||
		NormLoggingCounter,NormLogVector.size(),
 | 
							NormLoggingCounter,NormLogVector.size(),
 | 
				
			||||||
		value, NormLogVector[NormLoggingCounter]); fflush(stderr);
 | 
							value, NormLogVector[NormLoggingCounter]); fflush(stderr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if(!ContinueOnFail)assert(0); // Force takedown of job
 | 
						BACKTRACEFP(stderr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if(!ContinueOnFail) return false;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	ErrorCounter++;
 | 
						ErrorCounter++;
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
@@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    NormLoggingCounter++;
 | 
					    NormLoggingCounter++;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  return true;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
void FlightRecorder::CsumLog(uint64_t hex)
 | 
					bool FlightRecorder::CsumLog(uint64_t hex)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  if(LoggingMode == LoggingModePrint) {
 | 
					  if(LoggingMode == LoggingModePrint) {
 | 
				
			||||||
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
					    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
				
			||||||
    CsumLoggingCounter++;
 | 
					    CsumLoggingCounter++;
 | 
				
			||||||
 | 
					    return true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if(LoggingMode == LoggingModeRecord) {
 | 
					  if(LoggingMode == LoggingModeRecord) {
 | 
				
			||||||
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
					    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
 | 
				
			||||||
    CsumLogVector.push_back(hex);
 | 
					    CsumLogVector.push_back(hex);
 | 
				
			||||||
    CsumLoggingCounter++;
 | 
					    CsumLoggingCounter++;
 | 
				
			||||||
 | 
					    return true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if(LoggingMode == LoggingModeVerify) {
 | 
					  if(LoggingMode == LoggingModeVerify) {
 | 
				
			||||||
@@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      if ( hex != hexref ) {
 | 
					      if ( hex != hexref ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
				
			||||||
 | 
							FlightRecorder::StepLoggingCounter,
 | 
				
			||||||
 | 
							FlightRecorder::StepName);
 | 
				
			||||||
        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
 | 
					        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
 | 
				
			||||||
		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
 | 
							 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
 | 
				
			|||||||
		GridHostname(),
 | 
							GridHostname(),
 | 
				
			||||||
		GlobalSharedMemory::WorldShmRank,
 | 
							GlobalSharedMemory::WorldShmRank,
 | 
				
			||||||
		CsumLoggingCounter,hex, hexref);
 | 
							CsumLoggingCounter,hex, hexref);
 | 
				
			||||||
 | 
						BACKTRACEFP(stderr);
 | 
				
			||||||
	fflush(stderr);
 | 
						fflush(stderr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if(!ContinueOnFail) assert(0); // Force takedown of job
 | 
						if(!ContinueOnFail) return false;
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	ErrorCounter++;
 | 
						ErrorCounter++;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    CsumLoggingCounter++;
 | 
					    CsumLoggingCounter++;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  return true;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void FlightRecorder::ReductionLog(double local,double global)
 | 
					void FlightRecorder::ReductionLog(double local,double global)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  uint64_t hex_l = * ( (uint64_t *)&local );
 | 
					  uint64_t hex_l = * ( (uint64_t *)&local );
 | 
				
			||||||
@@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
 | 
				
			|||||||
  if(LoggingMode == LoggingModeVerify) {
 | 
					  if(LoggingMode == LoggingModeVerify) {
 | 
				
			||||||
    if(ReductionLoggingCounter < ReductionLogVector.size()){
 | 
					    if(ReductionLoggingCounter < ReductionLogVector.size()){
 | 
				
			||||||
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
 | 
					      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
 | 
				
			||||||
 | 
						fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
				
			||||||
 | 
							FlightRecorder::StepLoggingCounter,
 | 
				
			||||||
 | 
							FlightRecorder::StepName);
 | 
				
			||||||
	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
 | 
						fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
 | 
				
			||||||
		GridHostname(),
 | 
							GridHostname(),
 | 
				
			||||||
		GlobalSharedMemory::WorldShmRank,
 | 
							GlobalSharedMemory::WorldShmRank,
 | 
				
			||||||
		ReductionLoggingCounter,ReductionLogVector.size(),
 | 
							ReductionLoggingCounter,ReductionLogVector.size(),
 | 
				
			||||||
		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
 | 
							global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
 | 
				
			||||||
 | 
						BACKTRACEFP(stderr);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	if ( !ContinueOnFail ) assert(0);
 | 
						if ( !ContinueOnFail ) assert(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -267,11 +297,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 | 
				
			|||||||
  if(LoggingMode == LoggingModeVerify) {
 | 
					  if(LoggingMode == LoggingModeVerify) {
 | 
				
			||||||
    if(XmitLoggingCounter < XmitLogVector.size()){
 | 
					    if(XmitLoggingCounter < XmitLogVector.size()){
 | 
				
			||||||
      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
 | 
					      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
 | 
				
			||||||
 | 
						fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
				
			||||||
 | 
							FlightRecorder::StepLoggingCounter,
 | 
				
			||||||
 | 
							FlightRecorder::StepName);
 | 
				
			||||||
	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
 | 
						fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
 | 
				
			||||||
		GridHostname(),
 | 
							GridHostname(),
 | 
				
			||||||
		GlobalSharedMemory::WorldShmRank,
 | 
							GlobalSharedMemory::WorldShmRank,
 | 
				
			||||||
		XmitLoggingCounter,XmitLogVector.size(),
 | 
							XmitLoggingCounter,XmitLogVector.size(),
 | 
				
			||||||
		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
 | 
							_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
 | 
				
			||||||
 | 
						BACKTRACEFP(stderr);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	if ( !ContinueOnFail ) assert(0);
 | 
						if ( !ContinueOnFail ) assert(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -309,11 +343,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
 | 
				
			|||||||
  if(LoggingMode == LoggingModeVerify) {
 | 
					  if(LoggingMode == LoggingModeVerify) {
 | 
				
			||||||
    if(RecvLoggingCounter < RecvLogVector.size()){
 | 
					    if(RecvLoggingCounter < RecvLogVector.size()){
 | 
				
			||||||
      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
 | 
					      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
 | 
				
			||||||
 | 
						fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
 | 
				
			||||||
 | 
							FlightRecorder::StepLoggingCounter,
 | 
				
			||||||
 | 
							FlightRecorder::StepName);
 | 
				
			||||||
	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
 | 
						fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
 | 
				
			||||||
		GridHostname(),
 | 
							GridHostname(),
 | 
				
			||||||
		GlobalSharedMemory::WorldShmRank,
 | 
							GlobalSharedMemory::WorldShmRank,
 | 
				
			||||||
		RecvLoggingCounter,RecvLogVector.size(),
 | 
							RecvLoggingCounter,RecvLogVector.size(),
 | 
				
			||||||
		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
 | 
							_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
 | 
				
			||||||
 | 
						BACKTRACEFP(stderr);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	if ( !ContinueOnFail ) assert(0);
 | 
						if ( !ContinueOnFail ) assert(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -12,6 +12,8 @@ class FlightRecorder {
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  static int                   LoggingMode;
 | 
					  static int                   LoggingMode;
 | 
				
			||||||
  static uint64_t              ErrorCounter;
 | 
					  static uint64_t              ErrorCounter;
 | 
				
			||||||
 | 
					  static const char *                StepName;
 | 
				
			||||||
 | 
					  static int32_t               StepLoggingCounter;
 | 
				
			||||||
  static int32_t               XmitLoggingCounter;
 | 
					  static int32_t               XmitLoggingCounter;
 | 
				
			||||||
  static int32_t               RecvLoggingCounter;
 | 
					  static int32_t               RecvLoggingCounter;
 | 
				
			||||||
  static int32_t               CsumLoggingCounter;
 | 
					  static int32_t               CsumLoggingCounter;
 | 
				
			||||||
@@ -30,8 +32,9 @@ class FlightRecorder {
 | 
				
			|||||||
  static void SetLoggingModeRecord(void);
 | 
					  static void SetLoggingModeRecord(void);
 | 
				
			||||||
  static void SetLoggingModeVerify(void);
 | 
					  static void SetLoggingModeVerify(void);
 | 
				
			||||||
  static void SetLoggingMode(LoggingMode_t mode);
 | 
					  static void SetLoggingMode(LoggingMode_t mode);
 | 
				
			||||||
  static void NormLog(double value);
 | 
					  static bool StepLog(const char *name);
 | 
				
			||||||
  static void CsumLog(uint64_t csum);
 | 
					  static bool NormLog(double value);
 | 
				
			||||||
 | 
					  static bool CsumLog(uint64_t csum);
 | 
				
			||||||
  static void ReductionLog(double lcl, double glbl);
 | 
					  static void ReductionLog(double lcl, double glbl);
 | 
				
			||||||
  static void Truncate(void);
 | 
					  static void Truncate(void);
 | 
				
			||||||
  static void ResetCounters(void);
 | 
					  static void ResetCounters(void);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -552,6 +552,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
				
			|||||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
					void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  fprintf(stderr,"Signal handler on host %s\n",hostname);
 | 
					  fprintf(stderr,"Signal handler on host %s\n",hostname);
 | 
				
			||||||
 | 
					  fprintf(stderr,"FlightRecorder step %d stage %s \n",
 | 
				
			||||||
 | 
						  FlightRecorder::StepLoggingCounter,
 | 
				
			||||||
 | 
						  FlightRecorder::StepName);
 | 
				
			||||||
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
 | 
					  fprintf(stderr,"Caught signal %d\n",si->si_signo);
 | 
				
			||||||
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
					  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
				
			||||||
  fprintf(stderr,"         code %d\n",si->si_code);
 | 
					  fprintf(stderr,"         code %d\n",si->si_code);
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										14
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								configure.ac
									
									
									
									
									
								
							@@ -128,6 +128,20 @@ case ${ac_LAPACK} in
 | 
				
			|||||||
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 | 
					        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 | 
				
			||||||
esac
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					############### internal reduction
 | 
				
			||||||
 | 
					AC_ARG_ENABLE([reduction],
 | 
				
			||||||
 | 
					    [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
 | 
				
			||||||
 | 
					    [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					case ${ac_REDUCTION} in
 | 
				
			||||||
 | 
					    mpi)
 | 
				
			||||||
 | 
					        ;;
 | 
				
			||||||
 | 
					    grid)
 | 
				
			||||||
 | 
					        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
 | 
				
			||||||
 | 
					    *)
 | 
				
			||||||
 | 
					        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
 | 
				
			||||||
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
############### tracing
 | 
					############### tracing
 | 
				
			||||||
AC_ARG_ENABLE([tracing],
 | 
					AC_ARG_ENABLE([tracing],
 | 
				
			||||||
    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
 | 
					    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,8 +1,14 @@
 | 
				
			|||||||
 | 
					#Ahead of time compile for PVC
 | 
				
			||||||
 | 
					export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
				
			||||||
 | 
					export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#JIT compile 
 | 
				
			||||||
 | 
					#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
				
			||||||
 | 
					#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
					 | 
				
			||||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions "
 | 
					 | 
				
			||||||
../../configure \
 | 
					../../configure \
 | 
				
			||||||
	--enable-simd=GPU \
 | 
						--enable-simd=GPU \
 | 
				
			||||||
 | 
						--enable-reduction=grid \
 | 
				
			||||||
	--enable-gen-simd-width=64 \
 | 
						--enable-gen-simd-width=64 \
 | 
				
			||||||
	--enable-comms=mpi-auto \
 | 
						--enable-comms=mpi-auto \
 | 
				
			||||||
	--enable-debug \
 | 
						--enable-debug \
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,7 +1,8 @@
 | 
				
			|||||||
module load oneapi/release/2023.12.15.001
 | 
					#module load oneapi/release/2023.12.15.001
 | 
				
			||||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
 | 
					#module load mpich/icc-all-debug-pmix-gpu/52.2
 | 
				
			||||||
#module load mpich-config/mode/deterministic
 | 
					#module load mpich-config/mode/deterministic
 | 
				
			||||||
#module load intel_compute_runtime/release/821.35
 | 
					#module load intel_compute_runtime/release/821.35
 | 
				
			||||||
 | 
					
 | 
				
			||||||
source ~/spack/share/spack/setup-env.sh 
 | 
					source ~/spack/share/spack/setup-env.sh 
 | 
				
			||||||
spack load c-lime
 | 
					spack load c-lime
 | 
				
			||||||
spack load openssl
 | 
					spack load openssl
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -124,6 +124,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
					  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
				
			||||||
  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
					  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
				
			||||||
 | 
					  //  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
 | 
				
			||||||
 | 
					  //  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int nsecs=600;
 | 
					  int nsecs=600;
 | 
				
			||||||
  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
 | 
					  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
 | 
				
			||||||
@@ -131,6 +133,10 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    GridCmdOptionInt(arg,nsecs);
 | 
					    GridCmdOptionInt(arg,nsecs);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
 | 
				
			||||||
 | 
					  UGrid->Barrier();
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
 | 
					  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
 | 
					  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
 | 
				
			||||||
@@ -148,7 +154,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  FlightRecorder::ContinueOnFail = 0;
 | 
					  FlightRecorder::ContinueOnFail = 0;
 | 
				
			||||||
  FlightRecorder::PrintEntireLog = 0;
 | 
					  FlightRecorder::PrintEntireLog = 0;
 | 
				
			||||||
  FlightRecorder::ChecksumComms  = 1;
 | 
					  FlightRecorder::ChecksumComms  = 0;
 | 
				
			||||||
  FlightRecorder::ChecksumCommsSend=0;
 | 
					  FlightRecorder::ChecksumCommsSend=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
 | 
					  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
 | 
				
			||||||
@@ -180,7 +186,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    iter ++;
 | 
					    iter ++;
 | 
				
			||||||
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
 | 
					    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
 | 
				
			||||||
  } while (now < (start + nsecs/10) );
 | 
					  } while (now < (start + nsecs/10) );
 | 
				
			||||||
    
 | 
					
 | 
				
			||||||
  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
 | 
					  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
 | 
				
			||||||
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
 | 
					  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
 | 
				
			||||||
  int i=0;
 | 
					  int i=0;
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user