mirror of https://github.com/paboyle/Grid.git synced 2025-06-23 02:02:02 +01:00

Compare commits


20 Commits

SHA1 Message Date
913fbca74a Merge pull request #410 from gkanwar/photon_and_sha_patches
Photon.h and SHA256 patches
2022-08-31 18:01:45 -04:00
60dfb49afa Remove FP16 tests when FP16 is disabled 2022-08-21 17:29:55 +02:00
554c238359 Update OpenSSL digest to use high-level methods
This avoids deprecation warnings when compiling against OpenSSL 3.0
but should still be backwards compatible. It is the recommended way
to use the digest API going forward.
2022-08-21 17:28:57 +02:00
f922adf05e Fix Photon ComplexField type 2022-08-21 16:16:18 +02:00
188d2c7a4d PVC default, ignore ATS 2022-08-02 08:38:53 -07:00
17d7177105 Files for SYCL 2022-08-02 08:33:39 -07:00
bb0a0da47a Non-blocking caution due to SYCL 2022-08-02 08:09:43 -07:00
84110166e4 Fix the fence 2022-08-02 08:00:43 -07:00
d32b923b6c Fencing on a stream in SYCL is needed. Didn't know that ... gulp 2022-08-02 07:58:04 -07:00
2ab1af5754 Ensure no synchronize and not option dependent 2022-07-19 09:51:06 -07:00
5f8892bf03 Mistake pointed out by Camilo 2022-07-19 09:31:51 -07:00
f14e7e51e7 Grid accelerator 2022-07-12 10:56:22 -07:00
042ab1a052 Update GridStd.h 2022-06-27 13:21:39 -04:00
2df98a99bc Merge pull request #406 from giordano/patch-1
Update default value of gen-simd-width in README
2022-06-14 17:46:25 -04:00
315ea18be2 Update default value of gen-simd-width in README 2022-06-14 22:41:05 +01:00
a9c2e1df03 Merge pull request #404 from rrhodgson/feature/json_nvcc
Feature/json nvcc
2022-05-25 13:30:11 -04:00
da4daea57a Updated json to latest release 3.10.5 2022-05-24 16:16:06 +01:00
e346154c5d Updated json CUDA compile guards 2022-05-24 15:48:01 +01:00
3ca0de1c40 Fix json write for vector<string> 2022-05-24 14:37:33 +01:00
c7205d2a73 Removed nvcc guards for json 2022-05-24 14:30:26 +01:00
20 changed files with 13838 additions and 10524 deletions


@@ -16,6 +16,7 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>


@@ -262,7 +262,7 @@ public:
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});


@@ -264,7 +264,7 @@ public:
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
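Both hunks above swap the non-blocking kernel launch for the blocking one. For orientation, a minimal sketch of how the two macros conventionally relate in Grid-style accelerator headers — an assumption for illustration, not a verbatim copy of Grid's Accelerator.h:

// Assumed convention: accelerator_forNB enqueues the kernel and returns
// immediately; accelerator_for adds a stream-wide wait, so Tnp/Tnm are
// complete before anything downstream reads them.
#define accelerator_for(iter, num, nsimd, ...)            \
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ });   \
  accelerator_barrier(dummy);

With the non-blocking form, a later consumer of Tnp/Tnm could race against these writes; the blocking form closes that window.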


@@ -392,9 +392,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
// if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
// this->StencilSendToRecvFromComplete(list,dir);
// }
return off_node_bytes;
}

File diff suppressed because it is too large.


@@ -498,6 +498,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
acceleratorFenceComputeStream();
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
@@ -505,11 +506,13 @@
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
acceleratorFenceComputeStream();
}
assert(0 && " Kernel optimisation case not covered ");
}
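A hedged reading of why the fences were added around the exterior case — the step names paraphrase this diff, and the surrounding driver code is not shown here:

// Assumed ordering that acceleratorFenceComputeStream() enforces:
//   1. interior kernels launch and overlap the halo exchange
//   2. the halo exchange deposits neighbour data in the comms buffers
//   3. acceleratorFenceComputeStream() — a wait on SYCL, a no-op on CUDA/HIP
//   4. the exterior kernel then reads the comms buffers safely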


@@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
typedef LinkField ComplexField;
};
typedef QedGImpl<vComplex> QedGImplR;


@@ -26,7 +26,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#ifndef GRID_HIP
NAMESPACE_BEGIN(Grid);
@@ -82,7 +82,7 @@ void JSONWriter::writeDefault(const std::string &s, const std::string &x)
if (s.size())
ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ;
else
ss_ << os.str() << " ," ;
ss_ << "\""<< os.str() << "\" ," ;
}
// Reader implementation ///////////////////////////////////////////////////////


@@ -54,7 +54,7 @@ namespace Grid
void pop(void);
template <typename U>
void writeDefault(const std::string &s, const U &x);
#ifdef __NVCC__
#if defined(GRID_CUDA) || defined(GRID_HIP)
void writeDefault(const std::string &s, const Grid::ComplexD &x)
{
std::complex<double> z(real(x),imag(x));
@@ -101,7 +101,7 @@
void readDefault(const std::string &s, std::vector<U> &output);
template <typename U, typename P>
void readDefault(const std::string &s, std::pair<U,P> &output);
#ifdef __NVCC__
#if defined(GRID_CUDA) || defined(GRID_HIP)
void readDefault(const std::string &s, ComplexD &output)
{
std::complex<double> z;


@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h"
#include "TextIO.h"
#include "XmlIO.h"
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#ifndef GRID_HIP
#include "JSON_IO.h"
#endif


@@ -195,12 +195,15 @@ void acceleratorInit(void)
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
// theCopyAccelerator = new sycl::queue (selectedDevice);
theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
zeInit(0);
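The commented-out line records the alternative of giving copies their own queue; the patch aliases it to the compute queue instead. A minimal sketch of that alternative, assuming DPC++ with the SYCL 2020 in_order queue property (illustrative, not the committed code):

// Hypothetical alternative: a second in-order queue on the same device, so
// device-to-device copies stay ordered among themselves but may overlap
// kernels submitted to theGridAccelerator.
theCopyAccelerator = new cl::sycl::queue(selectedDevice,
                          cl::sycl::property::queue::in_order());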


@@ -247,7 +247,6 @@ inline int acceleratorIsCommunicable(void *ptr)
//////////////////////////////////////////////
// SyCL acceleration
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#include <CL/sycl.hpp>
@@ -262,6 +261,7 @@ NAMESPACE_END(Grid);
NAMESPACE_BEGIN(Grid);
extern cl::sycl::queue *theGridAccelerator;
extern cl::sycl::queue *theCopyAccelerator;
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
@@ -289,7 +289,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(8)]] \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
@@ -298,19 +298,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
}); \
});
#define accelerator_barrier(dummy) theGridAccelerator->wait();
#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
theGridAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
@@ -511,7 +511,16 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
inline void acceleratorFreeCpu (void *ptr){free(ptr);};
#endif
//////////////////////////////////////////////
// Fencing needed ONLY for SYCL
//////////////////////////////////////////////
#ifdef GRID_SYCL
inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
#else
// Ordering within a stream guaranteed on Nvidia & AMD
inline void acceleratorFenceComputeStream(void){ };
#endif
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
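As the new comment states, CUDA and HIP order work within a stream, so the fence costs nothing there; SYCL queues give no such guarantee, so the fence must wait. A minimal usage sketch built from the macros in this diff — out_v, in_v, res_v, nsites and Nsimd are hypothetical placeholders:

accelerator_forNB(ss, nsites, Nsimd, {
  coalescedWrite(out_v[ss], in_v(ss));        // producer kernel, non-blocking
});
acceleratorFenceComputeStream();               // wait on SYCL, no-op elsewhere
accelerator_forNB(ss, nsites, Nsimd, {
  coalescedWrite(res_v[ss], 2.0*out_v(ss));    // consumer sees producer's writes
});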


@@ -27,6 +27,7 @@
/* END LEGAL */
extern "C" {
#include <openssl/sha.h>
#include <openssl/evp.h>
}
#ifdef USE_IPP
#include "ipp.h"
@@ -70,10 +71,8 @@ public:
static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
{
std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
SHA256_CTX sha256;
SHA256_Init (&sha256);
SHA256_Update(&sha256, data,bytes);
SHA256_Final (&hash[0], &sha256);
auto digest = EVP_get_digestbyname("SHA256");
EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL);
return hash;
}
static inline std::vector<int> sha256_seeds(const std::string &s)
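For reference, a self-contained sketch of the same one-shot call path — EVP_get_digestbyname and EVP_Digest are the documented OpenSSL EVP APIs the patch adopts; the small program around them is illustrative:

#include <openssl/evp.h>
#include <cstdio>
#include <vector>

int main(void)
{
  const char msg[] = "Grid";
  std::vector<unsigned char> hash(EVP_MAX_MD_SIZE);
  unsigned int len = 0;
  const EVP_MD *digest = EVP_get_digestbyname("SHA256");
  if (!digest) return 1;                            // digest unavailable
  EVP_Digest(msg, sizeof(msg) - 1, &hash[0], &len, digest, NULL);
  for (unsigned int i = 0; i < len; i++) printf("%02x", hash[i]);
  printf("\n");
  return 0;
}

Passing EVP_sha256() directly would avoid the name lookup; the lookup form mirrors the patch, which passes NULL for the length pointer since SHA-256's digest size is fixed at SHA256_DIGEST_LENGTH.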


@@ -148,7 +148,7 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
- `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 64 bytes).
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers.


@@ -0,0 +1,62 @@
#!/bin/sh
##SBATCH -p PVC-SPR-QZEH
##SBATCH -p PVC-ICX-QZNW
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
export I_MPI_OFFLOAD=1
export I_MPI_OFFLOAD_TOPOLIB=level_zero
export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
# export IGC_EnableLSCFenceUGMBeforeEOT=0
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
export SYCL_DEVICE_FILTER=gpu,level_zero
#export IGC_ShaderDumpEnable=1
#export IGC_DumpToCurrentDir=1
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
export ZE_AFFINITY_MASK=0.0
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
export ZE_AFFINITY_MASK=0
export I_MPI_OFFLOAD_CELL=device
export EnableImplicitScaling=1
export EnableWalkerPartition=1
#mpiexec -launcher ssh -n 2 -host localhost vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1
#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0
#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1


@@ -0,0 +1,26 @@
#!/bin/bash
##SBATCH -p PVC-SPR-QZEH
##SBATCH -p PVC-ICX-QZNW
#SBATCH -p QZ1J-ICX-PVC
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
# export IGC_EnableLSCFenceUGMBeforeEOT=0
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
#export IGC_ShaderDumpEnable=1
#export IGC_DumpToCurrentDir=1
export I_MPI_OFFLOAD=1
export I_MPI_OFFLOAD_TOPOLIB=level_zero
export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log

systems/PVC/benchmarks/wrap.sh Executable file

@@ -0,0 +1,14 @@
#!/bin/sh
export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
if [ $MPI_LOCALRANKID = "0" ]
then
# ~psteinbr/build_pti/ze_tracer -c $@
onetrace --chrome-kernel-timeline $@
else
$@
fi


@@ -0,0 +1,15 @@
INSTALL=/nfs/site/home/azusayax/install
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi \
--disable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=yes \
CXX=mpicxx \
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"

systems/PVC/setup.sh Normal file

@@ -0,0 +1,11 @@
export https_proxy=http://proxy-chain.intel.com:911
export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
module load intel-release
source /opt/intel/oneapi/PVC_setup.sh
#source /opt/intel/oneapi/ATS_setup.sh
module load intel/mpich/pvc45.3
export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH
#clsh embargo-ci-neo-022845
#source /opt/intel/vtune_amplifier/amplxe-vars.sh


@@ -793,6 +793,7 @@ int main (int argc, char ** argv)
}
std::cout <<" OK ! "<<std::endl;
#ifdef USE_FP16
// Double to Half
std::cout << GridLogMessage<< "Double to half" ;
precisionChange(&H[0],&D[0],Ndp);
@@ -822,6 +823,7 @@
assert( tmp < 1.0e-3 );
}
std::cout <<" OK ! "<<std::endl;
#endif
}
Grid_finalize();