Remove nofma

SYCL version update. Why do they keep making incompatible changes
Warning remove
2025-06-21 17:22:03 +01:00 · 2023-03-14 12:10:31 -07:00 · 2023-03-14 12:10:02 -07:00 · 2023-03-14 12:09:26 -07:00 · 2023-03-14 12:09:00 -07:00 · 2023-03-14 09:10:27 -07:00
10 changed files with 126 additions and 24 deletions
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@ -13,7 +13,7 @@ uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;

-void  MemoryManager::Audit(void){};
+void  MemoryManager::Audit(std::string s){};
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@ -29,6 +29,7 @@ Author: Christoph Lehner <christoph@lhnr.de>

 #include <Grid/GridCore.h>
 #include <pwd.h>
+#include <syscall.h>

 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@ -52,13 +52,6 @@ public:
  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
 public:

-#ifdef GRID_SYCL
-#define SYCL_HACK
-#endif  
-#ifdef SYCL_HACK
-  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
-			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
-#endif
  
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  _tmp(&FiveDimRedBlackGrid),
  Dirichlet(0)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
+  
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      _tmp(&Hgrid),
      anisotropyCoeff(anis)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
  // Allocate the required comms buffer
  ImportGauge(_Umu);
  if  (anisotropyCoeff.isAnisotropic){
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@ -433,7 +433,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    });									

 #define ASM_CALL(A)							\
-  thread_for( ss, Nsite, {						\
+  thread_for( sss, Nsite, {						\
+    int ss = st.lo->Reorder(sss);					\
    int sU = ss;							\
    int sF = ss*Ls;							\
    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@ -290,9 +290,9 @@ public:

 protected:
  GridBase *                        _grid;
-
 public:
  GridBase *Grid(void) const { return _grid; }
+  LebesgueOrder *lo;

  ////////////////////////////////////////////////////////////////////////
  // Needed to conveniently communicate gparity parameters into GPU memory
@ -337,6 +337,7 @@ public:
  ////////////////////////////////////////
  // Stencil query
  ////////////////////////////////////////
+#ifdef SHM_FAST_PATH
  inline int SameNode(int point) {

    int dimension    = this->_directions[point];
@ -356,7 +357,40 @@ public:
    if ( displacement == 0 ) return 1;
    return 0;
  }
+#else
+  // SameNode (build without SHM_FAST_PATH): returns 1 when the point's dimension is not split over processors, or when ShmBufferTranslate() gives a non-NULL pointer for the rank we receive from; returns 0 otherwise.
+  inline int SameNode(int point) {

+    int dimension    = this->_directions[point]; // dimension this stencil point shifts along
+    int displacement = this->_distances[point];  // signed shift distance for this point
+
+    int pd              = _grid->_processors[dimension];
+    int fd              = _grid->_fdimensions[dimension]; // NOTE(review): fd, ld, rd and simd_layout are not used below
+    int ld              = _grid->_ldimensions[dimension];
+    int rd              = _grid->_rdimensions[dimension];
+    int simd_layout     = _grid->_simd_layout[dimension];
+    int comm_dim        = _grid->_processors[dimension] >1 ; // nonzero when more than one processor along this dimension
+ 
+    int recv_from_rank; // filled in by ShiftedRanks() below
+    int xmit_to_rank;   // filled in by ShiftedRanks() below; not otherwise used here
+
+    if ( ! comm_dim ) return 1; // dimension is not distributed, so no other rank is involved
+
+    int nbr_proc; // processor offset handed to ShiftedRanks: 1 for positive displacement, pd-1 otherwise
+    if (displacement>0) nbr_proc = 1;
+    else                 nbr_proc = pd-1;
+
+    // FIXME  this logic needs to be sorted for three link term
+    //    assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); // presumably non-NULL only if that rank's buffer is addressable via shared memory -- TODO confirm
+
+    if ( shm==NULL ) return 0; // no translation available: treat the point as off node
+    return 1;
+  }
+#endif
  //////////////////////////////////////////
  // Comms packet queue for asynch thread
  // Use OpenMP Tasks for cleaner ???
@ -1056,7 +1090,7 @@ public:
    int comms_recv   = this->_comms_recv[point];
    int comms_partial_send   = this->_comms_partial_send[point] ;
    int comms_partial_recv   = this->_comms_partial_recv[point] ;
-
+    
    assert(rhs.Grid()==_grid);
    //	  conformable(_grid,rhs.Grid());

@ -1127,11 +1161,32 @@ public:
 	  recv_buf=this->u_recv_buf_p;
 	}

+	// potential SHM fast path for intranode
+	int shm_send=0;
+	int shm_recv=0;
+#ifdef SHM_FAST_PATH
+	// Put directly in place if we can
+	send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+	if ( (send_buf==NULL) ) {
+	  shm_send=0;
+	  send_buf = this->u_send_buf_p;
+	} else {
+	  shm_send=1;
+	}
+	void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+	if ( test_ptr != NULL ) shm_recv = 1;
+	//	static int printed;
+	//	if (!printed){
+	  //	  std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	//	  printed = 1;
+	//	}
+#else
 	////////////////////////////////////////////////////////
 	// Gather locally
 	////////////////////////////////////////////////////////
 	send_buf = this->u_send_buf_p; // Gather locally, must send
 	assert(send_buf!=NULL);
+#endif

 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
 	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
@ -1143,10 +1198,13 @@ public:
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
+	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); // send only if requested and not already placed via the SHM fast path
+	  int do_recv = (comms_recv|comms_partial_recv) && (!shm_recv ); // receive side is gated on the receive flags, as in the call this replaces
+	  // shm_send / shm_recv stay 0 unless SHM_FAST_PATH is defined, so the flags then reduce to the original comms_* expressions
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
-		    xmit_to_rank, comms_send|comms_partial_send,
-		    recv_from_rank, comms_recv|comms_partial_recv,
+		    xmit_to_rank, do_send,
+		    recv_from_rank, do_recv,
 		    xbytes,rbytes);
 	}

@ -1288,19 +1346,47 @@ public:

 	    int recv_from_rank;
 	    int xmit_to_rank;
-
+	    int shm_send=0;
+	    int shm_recv=0;
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
-
+#ifdef SHM_FAST_PATH
+  #warning STENCIL SHM FAST PATH SELECTED
+	    // shm == receive pointer         if offnode
+	    // shm == Translate[send pointer] if on node -- my view of his send pointer
+	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+	    if (shm==NULL) {
+	      shm = rp;
+	      // we found a packet that comes from MPI and contributes to this shift.
+	      // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+	      // Kernel will add the exterior_terms except if is_same_node.
+	      // leg of stencil
+	      shm_recv=0;
+	    } else {
+	      shm_recv=1;
+	    }
+	    rpointers[i] = shm;
+	    // Test send side
+	    void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+	    if ( test_ptr != NULL ) shm_send = 1;
+	    //	    static int printed;
+	    //	    if (!printed){
+	    //	      std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	    //	      printed = 1;
+	    //	    }
+#else
 	    rpointers[i] = rp;
+#endif
 	    
 	    int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
 	    if ( !duplicate  ) { 
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
+	      int do_send = (comms_send|comms_partial_send) && (!shm_send ); // send only if requested and not already served over SHM
+	      int do_recv = (comms_recv|comms_partial_recv) && (!shm_recv ); // receive gated on the receive flags, matching the replaced call
 	      AddPacket((void *)sp,(void *)rp,
-			xmit_to_rank,comms_send|comms_partial_send,
-			recv_from_rank,comms_recv|comms_partial_recv,
+			xmit_to_rank,do_send,
+			recv_from_rank,do_recv,
 			xbytes,rbytes);
 	    }

@ -1310,7 +1396,7 @@ public:

 	  }
 	}
-
+	// rpointer may be doing a remote read in the gather over SHM
 	if ( comms_recv|comms_partial_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@ -248,17 +248,23 @@ inline int  acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////
-#ifdef GRID_SYCL
-NAMESPACE_END(Grid);
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>

+#ifdef GRID_SYCL
 #define GRID_SYCL_LEVEL_ZERO_IPC

-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
+NAMESPACE_END(Grid);
+#if 0
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
+#else
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
+
 NAMESPACE_BEGIN(Grid);

 extern cl::sycl::queue *theGridAccelerator;
--- a/configure.ac
+++ b/configure.ac
@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in
      ;;
     *) ;;
 esac
+############### SHM fast path: allow kernels to remote copy over intranode
+AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no])
+case ${ac_SHM_FAST_PATH} in
+     yes)
+        AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] )
+      ;;
+     *) ;;
+esac

 ############### communication type selection
 AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@ -11,5 +11,5 @@ INSTALL=/nfs/site/home/azusayax/install
 	--enable-unified=yes \
 	CXX=mpicxx \
 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-	CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+	CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare"
Author	SHA1	Message	Date
Peter Boyle	a997d24743	Remove nofma	2023-03-14 12:10:31 -07:00
Peter Boyle	861e5d7f4c	SYCL version update. Why do they keep making incompatible changes	2023-03-14 12:10:02 -07:00
Peter Boyle	14cc142a14	Warning remove	2023-03-14 12:09:26 -07:00
Peter Boyle	f36b87deb5	syscall fix	2023-03-14 12:09:00 -07:00
Peter Boyle	eeb6e0a6e3	Renable cache blocking and efficient UPI type SHM comms	2023-03-14 09:10:27 -07:00
Peter Boyle	cad5b187dd	Cleanup	2023-03-14 09:08:16 -07:00
Peter Boyle	87697eb07e	SHared compile	2023-03-14 09:07:36 -07:00