mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Compare commits
2 Commits
b812a7b4c6
...
7e5bd46dd3
Author | SHA1 | Date | |
---|---|---|---|
|
7e5bd46dd3 | ||
|
228bbb9d81 |
@ -31,7 +31,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
|
||||
#include <hipblas/hipblas.h>
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <cublas_v2.h>
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
#include <oneapi/mkl.hpp>
|
||||
@ -51,7 +51,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
typedef hipblasHandle_t gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
typedef cudablasHandle_t gridblasHandle_t;
|
||||
typedef cublasHandle_t gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
@ -78,6 +78,7 @@ public:
|
||||
#ifdef GRID_CUDA
|
||||
std::cout << "cublasCreate"<<std::endl;
|
||||
cublasCreate(&gridblasHandle);
|
||||
cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
std::cout << "hipblasCreate"<<std::endl;
|
||||
|
@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
|
||||
#endif
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
|
||||
#endif
|
||||
} else if( interior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
70
systems/Booster/benchmarks/Benchmark_usqcd.csv
Normal file
70
systems/Booster/benchmarks/Benchmark_usqcd.csv
Normal file
@ -0,0 +1,70 @@
|
||||
Memory Bandwidth
|
||||
|
||||
Bytes, GB/s per node
|
||||
3145728, 225.900365
|
||||
50331648, 2858.859504
|
||||
254803968, 4145.556367
|
||||
805306368, 4905.772480
|
||||
1966080000, 4978.312557
|
||||
|
||||
|
||||
GEMM
|
||||
|
||||
M, N, K, BATCH, GF/s per rank
|
||||
16, 8, 16, 256, 1.713639
|
||||
16, 16, 16, 256, 288.268316
|
||||
16, 32, 16, 256, 597.053950
|
||||
32, 8, 32, 256, 557.382591
|
||||
32, 16, 32, 256, 1100.145311
|
||||
32, 32, 32, 256, 1885.080449
|
||||
64, 8, 64, 256, 1725.163599
|
||||
64, 16, 64, 256, 3389.336566
|
||||
64, 32, 64, 256, 4168.252422
|
||||
16, 8, 256, 256, 1326.262134
|
||||
16, 16, 256, 256, 2318.095475
|
||||
16, 32, 256, 256, 3555.436503
|
||||
32, 8, 256, 256, 1920.139170
|
||||
32, 16, 256, 256, 3486.174753
|
||||
32, 32, 256, 256, 5320.821724
|
||||
64, 8, 256, 256, 2539.597502
|
||||
64, 16, 256, 256, 5003.456775
|
||||
64, 32, 256, 256, 7837.531562
|
||||
8, 256, 16, 256, 1427.848170
|
||||
16, 256, 16, 256, 2222.147815
|
||||
32, 256, 16, 256, 2877.121715
|
||||
8, 256, 32, 256, 1922.890086
|
||||
16, 256, 32, 256, 3199.469082
|
||||
32, 256, 32, 256, 4845.405343
|
||||
8, 256, 64, 256, 2639.483343
|
||||
16, 256, 64, 256, 5012.800299
|
||||
32, 256, 64, 256, 7216.006882
|
||||
|
||||
|
||||
|
||||
Communications
|
||||
|
||||
Packet bytes, direction, GB/s per node
|
||||
4718592, 2, 206.570734
|
||||
4718592, 3, 207.501847
|
||||
4718592, 6, 189.730277
|
||||
4718592, 7, 204.301218
|
||||
15925248, 2, 307.882997
|
||||
15925248, 3, 287.901076
|
||||
15925248, 6, 295.603109
|
||||
15925248, 7, 300.682033
|
||||
37748736, 2, 331.740364
|
||||
37748736, 3, 338.610627
|
||||
37748736, 6, 332.580657
|
||||
37748736, 7, 336.336579
|
||||
|
||||
|
||||
Per node summary table
|
||||
|
||||
L , Wilson, DWF4, Staggered, GF/s per node
|
||||
|
||||
8 , 16, 1165, 10
|
||||
12 , 473, 4901, 163
|
||||
16 , 1436, 8464, 442
|
||||
24 , 4133, 10139, 1530
|
||||
32 , 5726, 11487, 2518
|
||||
|
|
@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=cuda \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--with-lime=$LIME \
|
||||
--disable-accelerator-cshift \
|
||||
--enable-accelerator-cshift \
|
||||
--disable-unified \
|
||||
CXX=nvcc \
|
||||
LDFLAGS="-cudart shared " \
|
||||
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
|
||||
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
module load GCC/9.3.0
|
||||
module load GMP/6.2.0
|
||||
module load MPFR/4.1.0
|
||||
module load OpenMPI/4.1.0rc1
|
||||
module load CUDA/11.3
|
||||
module load GCC
|
||||
module load GMP
|
||||
module load MPFR
|
||||
module load OpenMPI
|
||||
module load CUDA
|
||||
|
Loading…
Reference in New Issue
Block a user