Update configure

Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2025-06-11 11:56:56 +01:00 · 2024-10-15 14:38:45 +00:00 · 2024-10-15 14:36:06 +00:00 · 2024-10-15 14:35:17 +00:00 · 2024-10-15 14:32:35 +00:00 · 2024-10-15 14:32:11 +00:00
20 changed files with 267 additions and 73 deletions
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
-  typedef cl::sycl::queue *gridblasHandle_t;
+  typedef sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
  typedef int32_t gridblasHandle_t;
@ -89,9 +89,9 @@ public:
      gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-      cl::sycl::gpu_selector selector;
-      cl::sycl::device selectedDevice { selector };
-      cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
+      sycl::gpu_selector selector;
+      sycl::device selectedDevice { selector };
+      sycl::property_list q_prop{sycl::property::queue::in_order()};
      gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
      gridblasInit=1;
--- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h
@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
-      
+      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

      if(norm < OuterLoopNormMult * stop){
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 	break;
      }
-      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
--- a/Grid/communicator/Communicator_base.cc
+++ b/Grid/communicator/Communicator_base.cc
@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////

+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(ComplexF &c)
+{
+  GlobalSumP2P(c);
+}
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumP2P(c);
+}
+#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
  GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
-{
-  GlobalSumVector((float *)c,2*N);
-}
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
  GlobalSumVector((double *)&c,2);
 }
+#endif
+void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
+{
+  GlobalSumVector((float *)c,2*N);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
  GlobalSumVector((double *)c,2*N);
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -128,6 +128,34 @@ public:
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
  
+  template<class obj> void GlobalSumP2P(obj &o)
+  {
+    std::vector<obj> column;
+    obj accum = o;
+    int source,dest;
+    for(int d=0;d<_ndimension;d++){
+      column.resize(_processors[d]);
+      column[0] = accum;
+      std::vector<CommsRequest_t> list;
+      for(int p=1;p<_processors[d];p++){
+	ShiftedRanks(d,p,source,dest);
+	SendToRecvFromBegin(list,
+			    &column[0],
+			    dest,
+			    &column[p],
+			    source,
+			    sizeof(obj),d*100+p);
+
+      }
+      CommsComplete(list);
+      for(int p=1;p<_processors[d];p++){
+	accum = accum + column[p];
+      }
+    }
+    Broadcast(0,accum);
+    o=accum;
+  }
+
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator()
    }
  }
 }
+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(float &f){
+  CartesianCommunicator::GlobalSumP2P(f);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  CartesianCommunicator::GlobalSumP2P(d);
+}
+#else
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
+#endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
@ -287,20 +306,11 @@ void CartesianCommunicator::GlobalMax(double &d)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -569,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
    typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
      
    ze_ipc_mem_handle_t ihandle;
    clone_mem_t handle;
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  });
 }

+#define FAST_AXPY_NORM
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpy_norm");
-    return axpy_norm_fast(ret,a,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpy_norm_fast(ret,a,x,y);
+#else
+  ret = a*x+y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
  GRID_TRACE("axpby_norm");
-    return axpby_norm_fast(ret,a,b,x,y);
+#ifdef FAST_AXPY_NORM
+  return axpby_norm_fast(ret,a,b,x,y);
+#else
+  ret = a*x+b*y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }

 /// Trace product
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -290,8 +290,10 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();

+  bool ok;
 #ifdef GRID_SYCL
  uint64_t csum=0;
+  uint64_t csum2=0;
  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
  {
    // Hack
@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
    uint64_t *base= (uint64_t *)&l_v[0];
    csum=svm_xor(base,words);
+    ok = FlightRecorder::CsumLog(csum);
+    if ( !ok ) {
+      csum2=svm_xor(base,words);
+      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    } else {
+      //      csum2=svm_xor(base,words);
+      //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+    }
+    assert(ok);
  }
-  FlightRecorder::CsumLog(csum);
 #endif
+  FlightRecorder::StepLog("rank inner product");
  ComplexD nrm = rankInnerProduct(left,right);
+  //  ComplexD nrmck=nrm;
  RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm)); 
+  ok = FlightRecorder::NormLog(real(nrm));
+  if ( !ok ) {
+    ComplexD nrm2 = rankInnerProduct(left,right);
+    RealD local2 = real(nrm2);
+    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
+    assert(ok);
+  }
+  FlightRecorder::StepLog("Start global sum");
+  //  grid->GlobalSumP2P(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::StepLog("Finished global sum");
+  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  FlightRecorder::ReductionLog(local,real(nrm)); 
  return nrm;
 }
@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
      coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  });
+  bool ok;
+  uint64_t csum=0;
+  uint64_t csum2=0;
+#ifdef GRID_SYCL
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // z_v
+    {
+      Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&z_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+    // inner_v
+    {
+      Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
+      uint64_t *base= (uint64_t *)&inner_tmp_v[0];
+      csum=svm_xor(base,words);
+      ok = FlightRecorder::CsumLog(csum);
+      if ( !ok ) {
+	csum2=svm_xor(base,words);
+	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+      }
+      assert(ok);
+    }
+  }
+#endif
  nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
+  ok = FlightRecorder::NormLog(real(nrm));
+  assert(ok);
+  RealD local = real(nrm);
  grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm));
  return nrm; 
 }
 
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
  Integer nsimd= vobj::Nsimd();
  { 
    sycl::buffer<sobj, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
-      cgh.parallel_for(cl::sycl::range<1>{osites},
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(sycl::range<1>{osites},
                      Reduction,
-                      [=] (cl::sycl::id<1> item, auto &sum) {
+                      [=] (sycl::id<1> item, auto &sum) {
                        auto osite   = item[0];
                        sum +=Reduce(lat[osite]);
                      });
@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
  Word ret = 0;
  { 
    sycl::buffer<Word, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
-      cgh.parallel_for(cl::sycl::range<1>{L},
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
                      Reduction,
-                      [=] (cl::sycl::id<1> index, auto &sum) {
+                      [=] (sycl::id<1> index, auto &sum) {
                        sum ^=vec[index];
                      });
    });
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
  });

  for (int r = 0; r < rd; r++) {
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+      theGridAccelerator->submit([&](sycl::handler &cgh) {
+          auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(sycl::range<1>{subvol_size},
          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
+          [=](sycl::id<1> item, auto &sum) {
              auto s = item[0];
              sum += rb_p[r*subvol_size+s];
          });
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -364,9 +364,10 @@ public:
  ////////////////////////////////////////////////////////////////////////
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    FlightRecorder::StepLog("Communicate begin");
    // All GPU kernel tasks must complete
-    accelerator_barrier();     // All kernels should ALREADY be complete
-    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
+    //    accelerator_barrier();     // All kernels should ALREADY be complete
+    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
    for(int i=0;i<Packets.size();i++){
      _grid->StencilSendToRecvFromBegin(MpiReqs,
@ -386,18 +387,20 @@ public:

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
+    FlightRecorder::StepLog("Start communicate complete");
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
    if   ( this->partialDirichlet ) DslashLogPartial();
    else if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
-    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
-    accelerator_barrier(); 
+    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
+    //    accelerator_barrier(); 
    _grid->StencilBarrier(); 
    // run any checksums
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv )
 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
    }
+    FlightRecorder::StepLog("Finish communicate complete");
  }
  ////////////////////////////////////////////////////////////////////////
  // Blocking send and receive. Either sequential or parallel.
@ -473,7 +476,7 @@ public:
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
-    accelerator_barrier();
+    //    accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes

    assert(source.Grid()==_grid);
@ -487,7 +490,7 @@ public:
      HaloGatherDir(source,compress,point,face_idx);
    }
    accelerator_barrier(); // All my local gathers are complete
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
+    //    _grid->StencilBarrier();// Synch shared memory on a single nodes
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
  }
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@ -202,13 +202,13 @@ void acceleratorInit(void)

 #ifdef GRID_SYCL

-cl::sycl::queue *theGridAccelerator;
-cl::sycl::queue *theCopyAccelerator;
+sycl::queue *theGridAccelerator;
+sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
  int nDevices = 1;
-  //  cl::sycl::gpu_selector selector;
-  //  cl::sycl::device selectedDevice { selector };
+  //  sycl::gpu_selector selector;
+  //  sycl::device selectedDevice { selector };
  theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
  theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
@ -242,14 +242,14 @@ void acceleratorInit(void)
  gethostname(hostname, HOST_NAME_MAX+1);
  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);

-  auto devices = cl::sycl::device::get_devices();
+  auto devices = sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){

 #define GPU_PROP_STR(prop) \
-    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
+    printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());

 #define GPU_PROP_FMT(prop,FMT) \
-    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
+    printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());

 #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld");
    if ( world_rank == 0) {
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -302,7 +302,7 @@ NAMESPACE_END(Grid);

 // Force deterministic reductions
 #define SYCL_REDUCTION_DETERMINISTIC
-#include <sycl/CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
@ -314,8 +314,8 @@ inline void acceleratorMem(void)
  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
 }

-extern cl::sycl::queue *theGridAccelerator;
-extern cl::sycl::queue *theCopyAccelerator;
+extern sycl::queue *theGridAccelerator;
+extern sycl::queue *theCopyAccelerator;

 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@ -326,24 +326,24 @@ extern cl::sycl::queue *theCopyAccelerator;

 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
- return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; 
+ return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; 
 #else
 return 0;
 #endif
 } // SYCL specific

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
-  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
+  theGridAccelerator->submit([&](sycl::handler &cgh) {		\
    unsigned long nt=acceleratorThreads();				\
    if(nt < 8)nt=8;							\
    unsigned long unum1 = num1;						\
    unsigned long unum2 = num2;						\
    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
-    cl::sycl::range<3> local {nt,1,nsimd};				\
-    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+    sycl::range<3> local {nt,1,nsimd};				\
+    sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
    cgh.parallel_for(							\
-		     cl::sycl::nd_range<3>(global,local),		\
-		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		     sycl::nd_range<3>(global,local),			\
+		     [=] (sycl::nd_item<3> item) /*mutable*/		\
 		     [[intel::reqd_sub_group_size(16)]]			\
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
@ -369,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele
 inline int  acceleratorIsCommunicable(void *ptr)
 {
 #if 0
-  auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
-  if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
+  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
+  if ( uvm = sycl::usm::alloc::shared ) return 1;
  else return 0;
 #endif
  return 1;
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
 int FlightRecorder::LoggingMode;
 int FlightRecorder::ChecksumComms;
 int FlightRecorder::ChecksumCommsSend;
+const char *   FlightRecorder::StepName;
+int32_t  FlightRecorder::StepLoggingCounter;
 int32_t  FlightRecorder::XmitLoggingCounter;
 int32_t  FlightRecorder::RecvLoggingCounter;
 int32_t  FlightRecorder::CsumLoggingCounter;
@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
  CsumLoggingCounter=0;
  NormLoggingCounter=0;
  ReductionLoggingCounter=0;
+  StepName = "No steps started";
+  StepLoggingCounter=0;
 }
 void FlightRecorder::Truncate(void)
 {
@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
    assert(0);
  }
 }
+bool FlightRecorder::StepLog(const char *name)
+{
+  StepName = name;
+  StepLoggingCounter ++;
+  return true;
+}

 void FlightRecorder::SetLoggingModePrint(void)
 {
@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
 {
  return ErrorCounter;
 }
-void FlightRecorder::NormLog(double value)
+bool FlightRecorder::NormLog(double value)
 {
  uint64_t hex = * ( (uint64_t *)&value );
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLoggingCounter++;
+    return true;
  }
  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    NormLogVector.push_back(value);
    NormLoggingCounter++;
+    return true;
  }
  if(LoggingMode == LoggingModeVerify) {

@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)

      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {

+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
 		NormLoggingCounter,NormLogVector.size(),
 		value, NormLogVector[NormLoggingCounter]); fflush(stderr);

-	if(!ContinueOnFail)assert(0); // Force takedown of job
+	BACKTRACEFP(stderr);
+
+	if(!ContinueOnFail) return false;
 	  
 	ErrorCounter++;
      } else {
@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
    }
    NormLoggingCounter++;
  }
+  return true;
 }
-void FlightRecorder::CsumLog(uint64_t hex)
+bool FlightRecorder::CsumLog(uint64_t hex)
 {
  if(LoggingMode == LoggingModePrint) {
    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLoggingCounter++;
+    return true;
  }

  if(LoggingMode == LoggingModeRecord) {
    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
    CsumLogVector.push_back(hex);
    CsumLoggingCounter++;
+    return true;
  }

  if(LoggingMode == LoggingModeVerify) {
@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)

      if ( hex != hexref ) {

+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;

@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		CsumLoggingCounter,hex, hexref);
+	BACKTRACEFP(stderr);
 	fflush(stderr);

-	if(!ContinueOnFail) assert(0); // Force takedown of job
+	if(!ContinueOnFail) return false;
 	  
 	ErrorCounter++;

@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
    }
    CsumLoggingCounter++;
  }
+  return true;
 }
+
 void FlightRecorder::ReductionLog(double local,double global)
 {
  uint64_t hex_l = * ( (uint64_t *)&local );
@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
  if(LoggingMode == LoggingModeVerify) {
    if(ReductionLoggingCounter < ReductionLogVector.size()){
      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		ReductionLoggingCounter,ReductionLogVector.size(),
 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@ -267,11 +297,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
  if(LoggingMode == LoggingModeVerify) {
    if(XmitLoggingCounter < XmitLogVector.size()){
      if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		XmitLoggingCounter,XmitLogVector.size(),
 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

@ -309,11 +343,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
  if(LoggingMode == LoggingModeVerify) {
    if(RecvLoggingCounter < RecvLogVector.size()){
      if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
+	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
+		FlightRecorder::StepLoggingCounter,
+		FlightRecorder::StepName);
 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n",
 		GridHostname(),
 		GlobalSharedMemory::WorldShmRank,
 		RecvLoggingCounter,RecvLogVector.size(),
 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
+	BACKTRACEFP(stderr);
 	
 	if ( !ContinueOnFail ) assert(0);

--- a/Grid/util/FlightRecorder.h
+++ b/Grid/util/FlightRecorder.h
@ -12,6 +12,8 @@ class FlightRecorder {
  
  static int                   LoggingMode;
  static uint64_t              ErrorCounter;
+  static const char *                StepName;
+  static int32_t               StepLoggingCounter;
  static int32_t               XmitLoggingCounter;
  static int32_t               RecvLoggingCounter;
  static int32_t               CsumLoggingCounter;
@ -30,8 +32,9 @@ class FlightRecorder {
  static void SetLoggingModeRecord(void);
  static void SetLoggingModeVerify(void);
  static void SetLoggingMode(LoggingMode_t mode);
-  static void NormLog(double value);
-  static void CsumLog(uint64_t csum);
+  static bool StepLog(const char *name);
+  static bool NormLog(double value);
+  static bool CsumLog(uint64_t csum);
  static void ReductionLog(double lcl, double glbl);
  static void Truncate(void);
  static void ResetCounters(void);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -552,6 +552,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
 void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
  fprintf(stderr,"Signal handler on host %s\n",hostname);
+  fprintf(stderr,"FlightRecorder step %d stage %s \n",
+	  FlightRecorder::StepLoggingCounter,
+	  FlightRecorder::StepName);
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
  fprintf(stderr,"         code %d\n",si->si_code);
--- a/configure.ac
+++ b/configure.ac
@ -128,6 +128,20 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
 esac

+############### internal reduction
+AC_ARG_ENABLE([reduction],
+    [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
+    [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
+
+case ${ac_REDUCTION} in
+    mpi)
+        ;;
+    grid)
+        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
+    *)
+        AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
+esac
+
 ############### tracing
 AC_ARG_ENABLE([tracing],
    [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@ -1,8 +1,14 @@
+#Ahead of time compile for PVC
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
+
+#JIT compile 
+#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
+#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "

-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel  -fsycl -fno-exceptions "
 ../../configure \
 	--enable-simd=GPU \
+	--enable-reduction=grid \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-debug \
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@ -1,7 +1,8 @@
-module load oneapi/release/2023.12.15.001
+#module load oneapi/release/2023.12.15.001
 #module load mpich/icc-all-debug-pmix-gpu/52.2
 #module load mpich-config/mode/deterministic
 #module load intel_compute_runtime/release/821.35
+
 source ~/spack/share/spack/setup-env.sh 
 spack load c-lime
 spack load openssl
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@ -124,6 +124,8 @@ int main (int argc, char ** argv)

  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  //  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  //  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);

  int nsecs=600;
  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
@ -131,6 +133,10 @@ int main (int argc, char ** argv)
    GridCmdOptionInt(arg,nsecs);
  }
  
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
+  UGrid->Barrier();
+  std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
+
  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;

  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
@ -148,7 +154,7 @@ int main (int argc, char ** argv)

  FlightRecorder::ContinueOnFail = 0;
  FlightRecorder::PrintEntireLog = 0;
-  FlightRecorder::ChecksumComms  = 1;
+  FlightRecorder::ChecksumComms  = 0;
  FlightRecorder::ChecksumCommsSend=0;

  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG"))  FlightRecorder::PrintEntireLog     = atoi(s);
@ -180,7 +186,7 @@ int main (int argc, char ** argv)
    iter ++;
    now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
  } while (now < (start + nsecs/10) );
-    
+
  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  int i=0;
Author	SHA1	Message	Date
Peter Boyle	a78a61d76f	Update configure	2024-10-15 14:38:45 +00:00
Peter Boyle	2eff3f34ed	Alternate reduction; default to grids own but make a configure flag --enable-reduction=grid\|mpi	2024-10-15 14:36:06 +00:00
Peter Boyle	03687c1d62	Final version of test, closer to original again	2024-10-15 14:35:17 +00:00
Peter Boyle	febfe4e77f	Make my own reduction a configure flag	2024-10-15 14:32:35 +00:00
Peter Boyle	4d1aa134b5	Use normal reduction, configure flag to force deterministic	2024-10-15 14:32:11 +00:00
Peter Boyle	5ec879860a	Odd rounding issue - bears looking into	2024-10-15 14:30:54 +00:00
Peter Boyle	b728af903c	Fast axpy norm under CFLAG	2024-10-11 03:23:09 +00:00
Peter Boyle	54f1999030	axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues	2024-10-11 03:22:18 +00:00
Peter Boyle	fd58f0b669	Return ok	2024-10-11 03:21:21 +00:00
Peter Boyle	c5c67b706e	cl::sycl -> SYCL	2024-10-10 22:04:12 +00:00
Peter Boyle	be7a543e2c	Revert barriers -- these were not the problem	2024-10-10 22:03:29 +00:00
Peter Boyle	68f112d576	New software moves cl::sycl	2024-10-10 22:03:04 +00:00
Peter Boyle	ec1395a304	Better flight logging	2024-10-10 22:01:57 +00:00
Peter Boyle	beb0e474ee	Use deterministic own brand reduction	2024-10-10 22:01:24 +00:00
Peter Boyle	2b5fdcbbc5	New software version	2024-10-10 21:59:02 +00:00
Peter Boyle	295127d456	Deterministic homebrew reduction	2024-10-10 21:58:26 +00:00
Peter Boyle	7dcfb13694	New software stack	2024-10-10 21:57:35 +00:00