Booster update

Benchmark results
2025-11-15 02:59:32 +00:00 · 2024-03-06 19:03:45 +01:00 · 2024-03-06 19:03:35 +01:00
5 changed files with 83 additions and 20 deletions
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
-#include <hipblas/hipblas.h>
+#include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
 #include <oneapi/mkl.hpp>
@@ -51,7 +51,7 @@ NAMESPACE_BEGIN(Grid);
  typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
-  typedef cudablasHandle_t gridblasHandle_t;
+  typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
  typedef cl::sycl::queue *gridblasHandle_t;
@@ -78,6 +78,7 @@ public:
 #ifdef GRID_CUDA
      std::cout << "cublasCreate"<<std::endl;
      cublasCreate(&gridblasHandle);
+      cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
 #endif
 #ifdef GRID_HIP
      std::cout << "hipblasCreate"<<std::endl;
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,

  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
+#ifndef GRID_CUDA
    if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;}
 #endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
-#endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
-#endif
  }
  assert(0 && " Kernel optimisation case not covered ");
 }
@@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  
  if( interior && exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
-#endif
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
-#endif
  } else if( exterior ) { 
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
-#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
-#endif
  }
 }

--- a/systems/Booster/benchmarks/Benchmark_usqcd.csv
+++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv
@@ -0,0 +1,70 @@
+Memory Bandwidth
+
+Bytes, GB/s per node
+3145728, 225.900365
+50331648, 2858.859504
+254803968, 4145.556367
+805306368, 4905.772480
+1966080000, 4978.312557
+
+
+GEMM
+
+ M, N, K, BATCH, GF/s per rank
+16, 8, 16, 256, 1.713639
+16, 16, 16, 256, 288.268316
+16, 32, 16, 256, 597.053950
+32, 8, 32, 256, 557.382591
+32, 16, 32, 256, 1100.145311
+32, 32, 32, 256, 1885.080449
+64, 8, 64, 256, 1725.163599
+64, 16, 64, 256, 3389.336566
+64, 32, 64, 256, 4168.252422
+16, 8, 256, 256, 1326.262134
+16, 16, 256, 256, 2318.095475
+16, 32, 256, 256, 3555.436503
+32, 8, 256, 256, 1920.139170
+32, 16, 256, 256, 3486.174753
+32, 32, 256, 256, 5320.821724
+64, 8, 256, 256, 2539.597502
+64, 16, 256, 256, 5003.456775
+64, 32, 256, 256, 7837.531562
+8, 256, 16, 256, 1427.848170
+16, 256, 16, 256, 2222.147815
+32, 256, 16, 256, 2877.121715
+8, 256, 32, 256, 1922.890086
+16, 256, 32, 256, 3199.469082
+32, 256, 32, 256, 4845.405343
+8, 256, 64, 256, 2639.483343
+16, 256, 64, 256, 5012.800299
+32, 256, 64, 256, 7216.006882
+
+
+
+Communications
+
+Packet bytes, direction, GB/s per node
+4718592, 2, 206.570734
+4718592, 3, 207.501847
+4718592, 6, 189.730277
+4718592, 7, 204.301218
+15925248, 2, 307.882997
+15925248, 3, 287.901076
+15925248, 6, 295.603109
+15925248, 7, 300.682033
+37748736, 2, 331.740364
+37748736, 3, 338.610627
+37748736, 6, 332.580657
+37748736, 7, 336.336579
+
+
+Per node summary table
+
+L , Wilson, DWF4, Staggered, GF/s per node
+
+8 , 16, 1165, 10
+12 , 473, 4901, 163
+16 , 1436, 8464, 442
+24 , 4133, 10139, 1530
+32 , 5726, 11487, 2518
+
--- a/systems/Booster/config-command
+++ b/systems/Booster/config-command
@@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
    --enable-gen-simd-width=64 \
    --enable-shm=nvlink \
    --enable-accelerator=cuda \
+    --disable-gparity \
+    --disable-fermion-reps \
    --with-lime=$LIME \
-    --disable-accelerator-cshift \
+    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
-    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"

--- a/systems/Booster/sourceme.sh
+++ b/systems/Booster/sourceme.sh
@@ -1,5 +1,5 @@
-module load GCC/9.3.0       
-module load  GMP/6.2.0   
-module load MPFR/4.1.0     
-module load OpenMPI/4.1.0rc1  
-module load CUDA/11.3
+module load GCC
+module load GMP
+module load MPFR
+module load OpenMPI
+module load CUDA
Author	SHA1	Message	Date
Peter Boyle	7e5bd46dd3	Booster update	2024-03-06 19:03:45 +01:00
Peter Boyle	228bbb9d81	Benchmark results	2024-03-06 19:03:35 +01:00