diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 5ab46333..f6418b7e 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -31,7 +31,7 @@ Author: Peter Boyle #include #endif #ifdef GRID_CUDA -#include +#include #endif #ifdef GRID_SYCL #include @@ -51,7 +51,7 @@ NAMESPACE_BEGIN(Grid); typedef hipblasHandle_t gridblasHandle_t; #endif #ifdef GRID_CUDA - typedef cudablasHandle_t gridblasHandle_t; + typedef cublasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL typedef cl::sycl::queue *gridblasHandle_t; @@ -78,6 +78,7 @@ public: #ifdef GRID_CUDA std::cout << "cublasCreate"<::DhopImproved(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} +#ifndef GRID_CUDA if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} #endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} -#endif } assert(0 && " Kernel optimisation case not covered "); } @@ -322,19 +318,13 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} -#endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} -#endif } } diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index b47c240c..c1466b45 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts) } #endif +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////// diff --git a/systems/Booster/benchmarks/Benchmark_usqcd.csv b/systems/Booster/benchmarks/Benchmark_usqcd.csv new file mode 100644 index 00000000..68689deb --- /dev/null +++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv @@ -0,0 +1,70 @@ +Memory Bandwidth + +Bytes, GB/s per node +3145728, 225.900365 +50331648, 2858.859504 +254803968, 4145.556367 +805306368, 4905.772480 +1966080000, 4978.312557 + + +GEMM + + M, N, K, BATCH, GF/s per rank +16, 8, 16, 256, 1.713639 +16, 16, 16, 256, 288.268316 +16, 32, 16, 256, 597.053950 +32, 8, 32, 256, 557.382591 +32, 16, 32, 256, 1100.145311 +32, 32, 32, 256, 1885.080449 +64, 8, 64, 256, 1725.163599 +64, 16, 64, 256, 3389.336566 +64, 32, 64, 256, 4168.252422 +16, 8, 256, 256, 1326.262134 +16, 16, 256, 256, 2318.095475 +16, 32, 256, 256, 3555.436503 +32, 8, 256, 256, 1920.139170 +32, 16, 256, 256, 3486.174753 +32, 32, 256, 256, 5320.821724 +64, 8, 256, 256, 2539.597502 +64, 16, 256, 256, 5003.456775 +64, 32, 256, 256, 7837.531562 +8, 256, 16, 256, 1427.848170 +16, 256, 16, 256, 2222.147815 +32, 256, 16, 256, 2877.121715 +8, 256, 32, 256, 1922.890086 +16, 256, 32, 256, 3199.469082 +32, 256, 32, 256, 4845.405343 +8, 256, 64, 256, 2639.483343 +16, 256, 64, 256, 5012.800299 +32, 256, 64, 256, 7216.006882 + + + +Communications + +Packet bytes, direction, GB/s per node +4718592, 2, 206.570734 +4718592, 3, 207.501847 +4718592, 6, 189.730277 +4718592, 7, 204.301218 +15925248, 2, 307.882997 +15925248, 3, 287.901076 +15925248, 6, 295.603109 +15925248, 7, 300.682033 +37748736, 2, 331.740364 +37748736, 3, 338.610627 +37748736, 6, 332.580657 +37748736, 7, 336.336579 + + +Per node summary table + +L , Wilson, DWF4, Staggered, GF/s per node + +8 , 16, 1165, 10 +12 , 473, 4901, 163 +16 , 1436, 8464, 442 +24 , 4133, 10139, 1530 +32 , 5726, 11487, 2518 + diff --git a/systems/Booster/config-command b/systems/Booster/config-command index 8530c5f9..1ba2dc7a 100644 --- a/systems/Booster/config-command +++ b/systems/Booster/config-command @@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/ --enable-gen-simd-width=64 \ --enable-shm=nvlink \ --enable-accelerator=cuda \ + --disable-gparity \ + --disable-fermion-reps \ --with-lime=$LIME \ - --disable-accelerator-cshift \ + --enable-accelerator-cshift \ --disable-unified \ CXX=nvcc \ LDFLAGS="-cudart shared " \ - CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas" diff --git a/systems/Booster/sourceme.sh b/systems/Booster/sourceme.sh index 56499be4..2341267f 100644 --- a/systems/Booster/sourceme.sh +++ b/systems/Booster/sourceme.sh @@ -1,5 +1,5 @@ -module load GCC/9.3.0 -module load GMP/6.2.0 -module load MPFR/4.1.0 -module load OpenMPI/4.1.0rc1 -module load CUDA/11.3 +module load GCC +module load GMP +module load MPFR +module load OpenMPI +module load CUDA diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index fb1fa59a..e5f32ab5 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -30,6 +30,10 @@ Author: Peter Boyle using namespace std; using namespace Grid; +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + int main (int argc, char ** argv) { char hostname[HOST_NAME_MAX+1];