mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Perlmutter tune up
This commit is contained in:
parent
b2ccaad761
commit
c0d56a1c04
@ -389,7 +389,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
|
||||
acceleratorCopySynchronize(); // MPI prob slower
|
||||
acceleratorCopySynchronise(); // MPI prob slower
|
||||
}
|
||||
|
||||
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
|
||||
|
129
systems/Perlmutter/comms.4node
Normal file
129
systems/Perlmutter/comms.4node
Normal file
@ -0,0 +1,129 @@
|
||||
SLURM detected
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device Number : 0
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
|
||||
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
|
||||
AcceleratorCudaInit[0]: managedMemory: 1
|
||||
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||
AcceleratorCudaInit[0]: warpSize: 32
|
||||
AcceleratorCudaInit[0]: pciBusID: 2
|
||||
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||
AcceleratorCudaInit: using default device
|
||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
|
||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||
AcceleratorCudaInit: ================================================
|
||||
SharedMemoryMpi: World communicator of size 16
|
||||
SharedMemoryMpi: Node communicator of size 4
|
||||
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x7f8d40000000 for comms buffers
|
||||
Setting up IPC
|
||||
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|_ | | | | | | | | | | | | _|__
|
||||
__|_ _|__
|
||||
__|_ GGGG RRRR III DDDD _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G GG RRRR I D D _|__
|
||||
__|_ G G R R I D D _|__
|
||||
__|_ GGGG R R III DDDD _|__
|
||||
__|_ _|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
| | | | | | | | | | | | | |
|
||||
|
||||
|
||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes
|
||||
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : MPI is initialised and logging filters activated
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : Requested 1073741824 byte stencil comms buffers
|
||||
Grid : Message : MemoryManager Cache 34005057536 bytes
|
||||
Grid : Message : MemoryManager::Init() setting up
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
|
||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||
Grid : Message : 0.956704 s : Grid is setup to use 32 threads
|
||||
Grid : Message : 0.956709 s : Number of iterations to average: 250
|
||||
Grid : Message : 0.956712 s : ====================================================================================================
|
||||
Grid : Message : 0.956713 s : = Benchmarking sequential halo exchange from host memory
|
||||
Grid : Message : 0.956714 s : ====================================================================================================
|
||||
Grid : Message : 0.956715 s : L Ls bytes MB/s uni MB/s bidi
|
||||
Grid : Message : 1.108420 s : 8 8 393216 15427.2 30854.4
|
||||
Grid : Message : 1.198740 s : 8 8 393216 87332.8 174665.6
|
||||
Grid : Message : 1.574400 s : 8 8 393216 20938.0 41876.0
|
||||
Grid : Message : 1.956280 s : 8 8 393216 20598.0 41196.0
|
||||
Grid : Message : 1.125254 s : 12 8 1327104 105614.9 211229.8
|
||||
Grid : Message : 1.149709 s : 12 8 1327104 108578.8 217157.5
|
||||
Grid : Message : 1.262612 s : 12 8 1327104 23510.2 47020.4
|
||||
Grid : Message : 1.377804 s : 12 8 1327104 23043.0 46086.0
|
||||
Grid : Message : 1.445986 s : 16 8 3145728 107931.9 215863.7
|
||||
Grid : Message : 1.501495 s : 16 8 3145728 113380.0 226760.0
|
||||
Grid : Message : 1.766377 s : 16 8 3145728 23752.8 47505.6
|
||||
Grid : Message : 2.301720 s : 16 8 3145728 23850.6 47701.2
|
||||
Grid : Message : 2.158035 s : 20 8 6144000 109657.5 219315.0
|
||||
Grid : Message : 2.268232 s : 20 8 6144000 111535.7 223071.4
|
||||
Grid : Message : 2.779996 s : 20 8 6144000 24011.8 48023.6
|
||||
Grid : Message : 3.289081 s : 20 8 6144000 24137.8 48275.7
|
||||
Grid : Message : 3.549101 s : 24 8 10616832 89696.1 179392.2
|
||||
Grid : Message : 3.779416 s : 24 8 10616832 92205.2 184410.4
|
||||
Grid : Message : 4.656539 s : 24 8 10616832 24209.0 48417.9
|
||||
Grid : Message : 5.531893 s : 24 8 10616832 24257.5 48515.0
|
||||
Grid : Message : 6.800400 s : 28 8 16859136 76106.8 152213.6
|
||||
Grid : Message : 6.443946 s : 28 8 16859136 77350.6 154701.1
|
||||
Grid : Message : 7.830994 s : 28 8 16859136 24309.8 48619.6
|
||||
Grid : Message : 9.215301 s : 28 8 16859136 24357.8 48715.5
|
||||
Grid : Message : 9.955615 s : 32 8 25165824 72403.7 144807.4
|
||||
Grid : Message : 10.648284 s : 32 8 25165824 72666.2 145332.4
|
||||
Grid : Message : 12.713098 s : 32 8 25165824 24376.2 48752.3
|
||||
Grid : Message : 14.775577 s : 32 8 25165824 24403.6 48807.3
|
||||
Grid : Message : 14.777794 s : ====================================================================================================
|
||||
Grid : Message : 14.777799 s : = Benchmarking sequential halo exchange from GPU memory
|
||||
Grid : Message : 14.777800 s : ====================================================================================================
|
||||
Grid : Message : 14.777801 s : L Ls bytes MB/s uni MB/s bidi
|
||||
Grid : Message : 14.798392 s : 8 8 393216 49210.4 98420.9
|
||||
Grid : Message : 14.812519 s : 8 8 393216 55716.0 111432.1
|
||||
Grid : Message : 14.861908 s : 8 8 393216 15926.4 31852.9
|
||||
Grid : Message : 14.909307 s : 8 8 393216 16594.5 33189.1
|
||||
Grid : Message : 14.938366 s : 12 8 1327104 157435.7 314871.3
|
||||
Grid : Message : 14.954490 s : 12 8 1327104 164724.6 329449.3
|
||||
Grid : Message : 15.921650 s : 12 8 1327104 19280.2 38560.4
|
||||
Grid : Message : 15.229618 s : 12 8 1327104 19311.3 38622.7
|
||||
Grid : Message : 15.275707 s : 16 8 3145728 221257.5 442514.9
|
||||
Grid : Message : 15.303489 s : 16 8 3145728 226547.7 453095.4
|
||||
Grid : Message : 15.619610 s : 16 8 3145728 19902.6 39805.2
|
||||
Grid : Message : 15.935287 s : 16 8 3145728 19930.6 39861.2
|
||||
Grid : Message : 15.999038 s : 20 8 6144000 269586.0 539172.0
|
||||
Grid : Message : 16.435890 s : 20 8 6144000 275886.8 551773.7
|
||||
Grid : Message : 16.652349 s : 20 8 6144000 20185.6 40371.2
|
||||
Grid : Message : 17.262005 s : 20 8 6144000 20156.0 40311.9
|
||||
Grid : Message : 17.351417 s : 24 8 10616832 300428.2 600856.4
|
||||
Grid : Message : 17.421125 s : 24 8 10616832 304656.8 609313.6
|
||||
Grid : Message : 18.477072 s : 24 8 10616832 20108.9 40217.7
|
||||
Grid : Message : 19.556481 s : 24 8 10616832 19671.8 39343.6
|
||||
Grid : Message : 19.681365 s : 28 8 16859136 318966.5 637933.1
|
||||
Grid : Message : 19.786400 s : 28 8 16859136 321056.1 642112.1
|
||||
Grid : Message : 21.531557 s : 28 8 16859136 19321.2 38642.4
|
||||
Grid : Message : 23.384312 s : 28 8 16859136 18199.2 36398.3
|
||||
Grid : Message : 23.556358 s : 32 8 25165824 332397.6 664795.2
|
||||
Grid : Message : 23.706392 s : 32 8 25165824 335492.9 670985.8
|
||||
Grid : Message : 26.356425 s : 32 8 25165824 18992.9 37985.9
|
||||
Grid : Message : 29.126692 s : 32 8 25165824 18168.6 36337.3
|
||||
Grid : Message : 29.137480 s : ====================================================================================================
|
||||
Grid : Message : 29.137485 s : = All done; Bye Bye
|
||||
Grid : Message : 29.137486 s : ====================================================================================================
|
12
systems/Perlmutter/config-command
Normal file
12
systems/Perlmutter/config-command
Normal file
@ -0,0 +1,12 @@
|
||||
../../configure \
|
||||
--enable-comms=mpi \
|
||||
--enable-simd=GPU \
|
||||
--enable-shm=nvlink \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-accelerator=cuda \
|
||||
--disable-fermion-reps \
|
||||
--disable-unified \
|
||||
--disable-gparity \
|
||||
CXX=nvcc \
|
||||
LDFLAGS="-cudart shared " \
|
||||
CXXFLAGS="-ccbin CC -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
|
156
systems/Perlmutter/dwf.48.48.48.48.4node.opt0
Normal file
156
systems/Perlmutter/dwf.48.48.48.48.4node.opt0
Normal file
@ -0,0 +1,156 @@
|
||||
SLURM detected
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device Number : 0
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
|
||||
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
|
||||
AcceleratorCudaInit[0]: managedMemory: 1
|
||||
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||
AcceleratorCudaInit[0]: warpSize: 32
|
||||
AcceleratorCudaInit[0]: pciBusID: 2
|
||||
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||
AcceleratorCudaInit: using default device
|
||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
|
||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||
AcceleratorCudaInit: ================================================
|
||||
SharedMemoryMpi: World communicator of size 16
|
||||
SharedMemoryMpi: Node communicator of size 4
|
||||
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fc320000000 for comms buffers
|
||||
Setting up IPC
|
||||
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|_ | | | | | | | | | | | | _|__
|
||||
__|_ _|__
|
||||
__|_ GGGG RRRR III DDDD _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G GG RRRR I D D _|__
|
||||
__|_ G G R R I D D _|__
|
||||
__|_ GGGG R R III DDDD _|__
|
||||
__|_ _|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
| | | | | | | | | | | | | |
|
||||
|
||||
|
||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes
|
||||
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : MPI is initialised and logging filters activated
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||
Grid : Message : MemoryManager Cache 34005057536 bytes
|
||||
Grid : Message : MemoryManager::Init() setting up
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
|
||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||
Grid : Message : 0.762377 s : Grid Layout
|
||||
Grid : Message : 0.762378 s : Global lattice size : 48 48 48 48
|
||||
Grid : Message : 0.762381 s : OpenMP threads : 32
|
||||
Grid : Message : 0.762382 s : MPI tasks : 2 2 2 2
|
||||
Grid : Message : 0.790912 s : Making s innermost grids
|
||||
Grid : Message : 0.817408 s : Initialising 4d RNG
|
||||
Grid : Message : 0.840908 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||
Grid : Message : 0.840921 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||
Grid : Message : 0.911684 s : Initialising 5d RNG
|
||||
Grid : Message : 1.270530 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||
Grid : Message : 1.270544 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||
Grid : Message : 1.568435 s : Initialised RNGs
|
||||
Grid : Message : 2.241446 s : Drawing gauge field
|
||||
Grid : Message : 2.318921 s : Random gauge initialised
|
||||
Grid : Message : 2.779258 s : Setting up Cshift based reference
|
||||
Grid : Message : 3.188306 s : *****************************************************************
|
||||
Grid : Message : 3.188315 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||
Grid : Message : 3.188316 s : *****************************************************************
|
||||
Grid : Message : 3.188316 s : *****************************************************************
|
||||
Grid : Message : 3.188316 s : * Benchmarking DomainWallFermionR::Dhop
|
||||
Grid : Message : 3.188316 s : * Vectorising space-time by 8
|
||||
Grid : Message : 3.188317 s : * VComplexF size is 64 B
|
||||
Grid : Message : 3.188318 s : * SINGLE precision
|
||||
Grid : Message : 3.188318 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 3.188318 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 3.188318 s : *****************************************************************
|
||||
Grid : Message : 3.548355 s : Called warmup
|
||||
Grid : Message : 37.809000 s : Called Dw 3000 times in 3.42606e+07 us
|
||||
Grid : Message : 37.809040 s : mflop/s = 9.81714e+06
|
||||
Grid : Message : 37.809042 s : mflop/s per rank = 613572
|
||||
Grid : Message : 37.809043 s : mflop/s per node = 2.45429e+06
|
||||
Grid : Message : 37.809044 s : RF GiB/s (base 2) = 19948.2
|
||||
Grid : Message : 37.809045 s : mem GiB/s (base 2) = 12467.6
|
||||
Grid : Message : 37.810181 s : norm diff 1.03662e-13
|
||||
Grid : Message : 37.824163 s : #### Dhop calls report
|
||||
Grid : Message : 37.824168 s : WilsonFermion5D Number of DhopEO Calls : 6002
|
||||
Grid : Message : 37.824172 s : WilsonFermion5D TotalTime /Calls : 5719.36 us
|
||||
Grid : Message : 37.824173 s : WilsonFermion5D CommTime /Calls : 5085.34 us
|
||||
Grid : Message : 37.824174 s : WilsonFermion5D FaceTime /Calls : 265.445 us
|
||||
Grid : Message : 37.824175 s : WilsonFermion5D ComputeTime1/Calls : 23.4602 us
|
||||
Grid : Message : 37.824176 s : WilsonFermion5D ComputeTime2/Calls : 370.89 us
|
||||
Grid : Message : 37.824191 s : Average mflops/s per call : 2.36923e+09
|
||||
Grid : Message : 37.824194 s : Average mflops/s per call per rank : 1.48077e+08
|
||||
Grid : Message : 37.824195 s : Average mflops/s per call per node : 5.92307e+08
|
||||
Grid : Message : 37.824196 s : Average mflops/s per call (full) : 9.97945e+06
|
||||
Grid : Message : 37.824197 s : Average mflops/s per call per rank (full): 623716
|
||||
Grid : Message : 37.824198 s : Average mflops/s per call per node (full): 2.49486e+06
|
||||
Grid : Message : 37.824199 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 37.824199 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 37.824199 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 37.824199 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 37.824199 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 37.824199 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 41.538537 s : Compare to naive wilson implementation Dag to verify correctness
|
||||
Grid : Message : 41.538549 s : Called DwDag
|
||||
Grid : Message : 41.538550 s : norm dag result 12.0422
|
||||
Grid : Message : 41.543416 s : norm dag ref 12.0422
|
||||
Grid : Message : 41.548999 s : norm dag diff 7.6086e-14
|
||||
Grid : Message : 41.563564 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||
Grid : Message : 41.711516 s : src_e0.499992
|
||||
Grid : Message : 41.735103 s : src_o0.500008
|
||||
Grid : Message : 41.756142 s : *********************************************************
|
||||
Grid : Message : 41.756144 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||
Grid : Message : 41.756145 s : * Vectorising space-time by 8
|
||||
Grid : Message : 41.756146 s : * SINGLE precision
|
||||
Grid : Message : 41.756147 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 41.756148 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 41.756148 s : *********************************************************
|
||||
Grid : Message : 59.255023 s : Deo mflop/s = 9.6274e+06
|
||||
Grid : Message : 59.255044 s : Deo mflop/s per rank 601712
|
||||
Grid : Message : 59.255046 s : Deo mflop/s per node 2.40685e+06
|
||||
Grid : Message : 59.255048 s : #### Dhop calls report
|
||||
Grid : Message : 59.255049 s : WilsonFermion5D Number of DhopEO Calls : 3001
|
||||
Grid : Message : 59.255050 s : WilsonFermion5D TotalTime /Calls : 5830.89 us
|
||||
Grid : Message : 59.255051 s : WilsonFermion5D CommTime /Calls : 5143.28 us
|
||||
Grid : Message : 59.255052 s : WilsonFermion5D FaceTime /Calls : 316.834 us
|
||||
Grid : Message : 59.255053 s : WilsonFermion5D ComputeTime1/Calls : 37.4065 us
|
||||
Grid : Message : 59.255054 s : WilsonFermion5D ComputeTime2/Calls : 375.889 us
|
||||
Grid : Message : 59.255076 s : Average mflops/s per call : 1.4225e+09
|
||||
Grid : Message : 59.255077 s : Average mflops/s per call per rank : 8.8906e+07
|
||||
Grid : Message : 59.255078 s : Average mflops/s per call per node : 3.55624e+08
|
||||
Grid : Message : 59.255079 s : Average mflops/s per call (full) : 9.78858e+06
|
||||
Grid : Message : 59.255080 s : Average mflops/s per call per rank (full): 611786
|
||||
Grid : Message : 59.255081 s : Average mflops/s per call per node (full): 2.44714e+06
|
||||
Grid : Message : 59.255082 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 59.255082 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 59.255082 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 59.255082 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 59.255082 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 59.255082 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 59.286796 s : r_e6.02129
|
||||
Grid : Message : 59.290118 s : r_o6.02097
|
||||
Grid : Message : 59.292558 s : res12.0423
|
||||
Grid : Message : 59.482803 s : norm diff 0
|
||||
Grid : Message : 59.604297 s : norm diff even 0
|
||||
Grid : Message : 59.626743 s : norm diff odd 0
|
156
systems/Perlmutter/dwf.48.48.48.48.4node.opt1
Normal file
156
systems/Perlmutter/dwf.48.48.48.48.4node.opt1
Normal file
@ -0,0 +1,156 @@
|
||||
SLURM detected
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device Number : 0
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
|
||||
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
|
||||
AcceleratorCudaInit[0]: managedMemory: 1
|
||||
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||
AcceleratorCudaInit[0]: warpSize: 32
|
||||
AcceleratorCudaInit[0]: pciBusID: 2
|
||||
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||
AcceleratorCudaInit: using default device
|
||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
|
||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||
AcceleratorCudaInit: ================================================
|
||||
SharedMemoryMpi: World communicator of size 16
|
||||
SharedMemoryMpi: Node communicator of size 4
|
||||
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fbae0000000 for comms buffers
|
||||
Setting up IPC
|
||||
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|_ | | | | | | | | | | | | _|__
|
||||
__|_ _|__
|
||||
__|_ GGGG RRRR III DDDD _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G GG RRRR I D D _|__
|
||||
__|_ G G R R I D D _|__
|
||||
__|_ GGGG R R III DDDD _|__
|
||||
__|_ _|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
| | | | | | | | | | | | | |
|
||||
|
||||
|
||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes
|
||||
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : MPI is initialised and logging filters activated
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||
Grid : Message : MemoryManager Cache 34005057536 bytes
|
||||
Grid : Message : MemoryManager::Init() setting up
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
|
||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||
Grid : Message : 0.692368 s : Grid Layout
|
||||
Grid : Message : 0.692369 s : Global lattice size : 48 48 48 48
|
||||
Grid : Message : 0.692372 s : OpenMP threads : 32
|
||||
Grid : Message : 0.692372 s : MPI tasks : 2 2 2 2
|
||||
Grid : Message : 0.701977 s : Making s innermost grids
|
||||
Grid : Message : 0.711295 s : Initialising 4d RNG
|
||||
Grid : Message : 0.734938 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||
Grid : Message : 0.734948 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||
Grid : Message : 0.798281 s : Initialising 5d RNG
|
||||
Grid : Message : 1.161711 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||
Grid : Message : 1.161728 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||
Grid : Message : 1.522440 s : Initialised RNGs
|
||||
Grid : Message : 2.260710 s : Drawing gauge field
|
||||
Grid : Message : 2.102597 s : Random gauge initialised
|
||||
Grid : Message : 2.562592 s : Setting up Cshift based reference
|
||||
Grid : Message : 3.121880 s : *****************************************************************
|
||||
Grid : Message : 3.121970 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||
Grid : Message : 3.121980 s : *****************************************************************
|
||||
Grid : Message : 3.121980 s : *****************************************************************
|
||||
Grid : Message : 3.121980 s : * Benchmarking DomainWallFermionR::Dhop
|
||||
Grid : Message : 3.121980 s : * Vectorising space-time by 8
|
||||
Grid : Message : 3.121980 s : * VComplexF size is 64 B
|
||||
Grid : Message : 3.121990 s : * SINGLE precision
|
||||
Grid : Message : 3.121990 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 3.121990 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 3.121990 s : *****************************************************************
|
||||
Grid : Message : 3.350688 s : Called warmup
|
||||
Grid : Message : 35.847527 s : Called Dw 3000 times in 3.24968e+07 us
|
||||
Grid : Message : 35.847576 s : mflop/s = 1.035e+07
|
||||
Grid : Message : 35.847578 s : mflop/s per rank = 646874
|
||||
Grid : Message : 35.847579 s : mflop/s per node = 2.5875e+06
|
||||
Grid : Message : 35.847580 s : RF GiB/s (base 2) = 21030.9
|
||||
Grid : Message : 35.847581 s : mem GiB/s (base 2) = 13144.3
|
||||
Grid : Message : 35.848697 s : norm diff 1.03662e-13
|
||||
Grid : Message : 35.861967 s : #### Dhop calls report
|
||||
Grid : Message : 35.861973 s : WilsonFermion5D Number of DhopEO Calls : 6002
|
||||
Grid : Message : 35.861976 s : WilsonFermion5D TotalTime /Calls : 5426 us
|
||||
Grid : Message : 35.861977 s : WilsonFermion5D CommTime /Calls : 4817.47 us
|
||||
Grid : Message : 35.861978 s : WilsonFermion5D FaceTime /Calls : 246.175 us
|
||||
Grid : Message : 35.861979 s : WilsonFermion5D ComputeTime1/Calls : 8.72676 us
|
||||
Grid : Message : 35.861980 s : WilsonFermion5D ComputeTime2/Calls : 370.494 us
|
||||
Grid : Message : 35.861995 s : Average mflops/s per call : 6.50606e+09
|
||||
Grid : Message : 35.861999 s : Average mflops/s per call per rank : 4.06629e+08
|
||||
Grid : Message : 35.862000 s : Average mflops/s per call per node : 1.62652e+09
|
||||
Grid : Message : 35.862001 s : Average mflops/s per call (full) : 1.0519e+07
|
||||
Grid : Message : 35.862002 s : Average mflops/s per call per rank (full): 657438
|
||||
Grid : Message : 35.862003 s : Average mflops/s per call per node (full): 2.62975e+06
|
||||
Grid : Message : 35.862004 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 35.862004 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 35.862004 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 35.862004 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 35.862004 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 35.862004 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 39.599406 s : Compare to naive wilson implementation Dag to verify correctness
|
||||
Grid : Message : 39.599421 s : Called DwDag
|
||||
Grid : Message : 39.599422 s : norm dag result 12.0422
|
||||
Grid : Message : 39.604317 s : norm dag ref 12.0422
|
||||
Grid : Message : 39.609961 s : norm dag diff 7.6086e-14
|
||||
Grid : Message : 39.624145 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||
Grid : Message : 39.772334 s : src_e0.499992
|
||||
Grid : Message : 39.795705 s : src_o0.500008
|
||||
Grid : Message : 39.816822 s : *********************************************************
|
||||
Grid : Message : 39.816824 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||
Grid : Message : 39.816825 s : * Vectorising space-time by 8
|
||||
Grid : Message : 39.816826 s : * SINGLE precision
|
||||
Grid : Message : 39.816827 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 39.816828 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 39.816828 s : *********************************************************
|
||||
Grid : Message : 56.382758 s : Deo mflop/s = 1.017e+07
|
||||
Grid : Message : 56.382779 s : Deo mflop/s per rank 635627
|
||||
Grid : Message : 56.382781 s : Deo mflop/s per node 2.54251e+06
|
||||
Grid : Message : 56.382783 s : #### Dhop calls report
|
||||
Grid : Message : 56.382784 s : WilsonFermion5D Number of DhopEO Calls : 3001
|
||||
Grid : Message : 56.382785 s : WilsonFermion5D TotalTime /Calls : 5519.98 us
|
||||
Grid : Message : 56.382786 s : WilsonFermion5D CommTime /Calls : 4856.39 us
|
||||
Grid : Message : 56.382787 s : WilsonFermion5D FaceTime /Calls : 303.043 us
|
||||
Grid : Message : 56.382788 s : WilsonFermion5D ComputeTime1/Calls : 6.77807 us
|
||||
Grid : Message : 56.382789 s : WilsonFermion5D ComputeTime2/Calls : 376.551 us
|
||||
Grid : Message : 56.382810 s : Average mflops/s per call : 8.31124e+09
|
||||
Grid : Message : 56.382811 s : Average mflops/s per call per rank : 5.19453e+08
|
||||
Grid : Message : 56.382812 s : Average mflops/s per call per node : 2.07781e+09
|
||||
Grid : Message : 56.382813 s : Average mflops/s per call (full) : 1.03399e+07
|
||||
Grid : Message : 56.382814 s : Average mflops/s per call per rank (full): 646244
|
||||
Grid : Message : 56.382815 s : Average mflops/s per call per node (full): 2.58498e+06
|
||||
Grid : Message : 56.382816 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 56.382816 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 56.382816 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 56.382816 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 56.382816 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 56.382816 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 56.414571 s : r_e6.02129
|
||||
Grid : Message : 56.417837 s : r_o6.02097
|
||||
Grid : Message : 56.420535 s : res12.0423
|
||||
Grid : Message : 56.611957 s : norm diff 0
|
||||
Grid : Message : 56.730597 s : norm diff even 0
|
||||
Grid : Message : 56.752566 s : norm diff odd 0
|
156
systems/Perlmutter/dwf.64.64.64.64.4node.opt0
Normal file
156
systems/Perlmutter/dwf.64.64.64.64.4node.opt0
Normal file
@ -0,0 +1,156 @@
|
||||
SLURM detected
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device Number : 0
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
|
||||
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
|
||||
AcceleratorCudaInit[0]: managedMemory: 1
|
||||
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||
AcceleratorCudaInit[0]: warpSize: 32
|
||||
AcceleratorCudaInit[0]: pciBusID: 2
|
||||
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||
AcceleratorCudaInit: using default device
|
||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
|
||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||
AcceleratorCudaInit: ================================================
|
||||
SharedMemoryMpi: World communicator of size 16
|
||||
SharedMemoryMpi: Node communicator of size 4
|
||||
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fd460000000 for comms buffers
|
||||
Setting up IPC
|
||||
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|_ | | | | | | | | | | | | _|__
|
||||
__|_ _|__
|
||||
__|_ GGGG RRRR III DDDD _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G GG RRRR I D D _|__
|
||||
__|_ G G R R I D D _|__
|
||||
__|_ GGGG R R III DDDD _|__
|
||||
__|_ _|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
| | | | | | | | | | | | | |
|
||||
|
||||
|
||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes
|
||||
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : MPI is initialised and logging filters activated
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||
Grid : Message : MemoryManager Cache 34005057536 bytes
|
||||
Grid : Message : MemoryManager::Init() setting up
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
|
||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||
Grid : Message : 0.667601 s : Grid Layout
|
||||
Grid : Message : 0.667602 s : Global lattice size : 64 64 64 64
|
||||
Grid : Message : 0.667610 s : OpenMP threads : 32
|
||||
Grid : Message : 0.667611 s : MPI tasks : 2 2 2 2
|
||||
Grid : Message : 0.702872 s : Making s innermost grids
|
||||
Grid : Message : 0.742911 s : Initialising 4d RNG
|
||||
Grid : Message : 0.813463 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||
Grid : Message : 0.813479 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||
Grid : Message : 0.922630 s : Initialising 5d RNG
|
||||
Grid : Message : 2.306290 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||
Grid : Message : 2.306540 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||
Grid : Message : 3.878430 s : Initialised RNGs
|
||||
Grid : Message : 4.536926 s : Drawing gauge field
|
||||
Grid : Message : 4.824391 s : Random gauge initialised
|
||||
Grid : Message : 6.253195 s : Setting up Cshift based reference
|
||||
Grid : Message : 7.326402 s : *****************************************************************
|
||||
Grid : Message : 7.326411 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||
Grid : Message : 7.326412 s : *****************************************************************
|
||||
Grid : Message : 7.326412 s : *****************************************************************
|
||||
Grid : Message : 7.326412 s : * Benchmarking DomainWallFermionR::Dhop
|
||||
Grid : Message : 7.326412 s : * Vectorising space-time by 8
|
||||
Grid : Message : 7.326413 s : * VComplexF size is 64 B
|
||||
Grid : Message : 7.326414 s : * SINGLE precision
|
||||
Grid : Message : 7.326414 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 7.326414 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 7.326414 s : *****************************************************************
|
||||
Grid : Message : 8.283417 s : Called warmup
|
||||
Grid : Message : 89.658859 s : Called Dw 3000 times in 8.13753e+07 us
|
||||
Grid : Message : 89.658898 s : mflop/s = 1.3063e+07
|
||||
Grid : Message : 89.658900 s : mflop/s per rank = 816437
|
||||
Grid : Message : 89.658901 s : mflop/s per node = 3.26575e+06
|
||||
Grid : Message : 89.658902 s : RF GiB/s (base 2) = 26543.7
|
||||
Grid : Message : 89.658903 s : mem GiB/s (base 2) = 16589.8
|
||||
Grid : Message : 89.662424 s : norm diff 1.03481e-13
|
||||
Grid : Message : 89.700433 s : #### Dhop calls report
|
||||
Grid : Message : 89.700452 s : WilsonFermion5D Number of DhopEO Calls : 6002
|
||||
Grid : Message : 89.700456 s : WilsonFermion5D TotalTime /Calls : 13588.2 us
|
||||
Grid : Message : 89.700457 s : WilsonFermion5D CommTime /Calls : 12137.3 us
|
||||
Grid : Message : 89.700458 s : WilsonFermion5D FaceTime /Calls : 548.408 us
|
||||
Grid : Message : 89.700459 s : WilsonFermion5D ComputeTime1/Calls : 42.6163 us
|
||||
Grid : Message : 89.700460 s : WilsonFermion5D ComputeTime2/Calls : 910.312 us
|
||||
Grid : Message : 89.700477 s : Average mflops/s per call : 4.43502e+09
|
||||
Grid : Message : 89.700493 s : Average mflops/s per call per rank : 2.77189e+08
|
||||
Grid : Message : 89.700494 s : Average mflops/s per call per node : 1.10875e+09
|
||||
Grid : Message : 89.700495 s : Average mflops/s per call (full) : 1.32753e+07
|
||||
Grid : Message : 89.700496 s : Average mflops/s per call per rank (full): 829709
|
||||
Grid : Message : 89.700497 s : Average mflops/s per call per node (full): 3.31884e+06
|
||||
Grid : Message : 89.700498 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 89.700498 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 89.700498 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 89.700499 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 89.700499 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 89.700499 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 101.462401 s : Compare to naive wilson implementation Dag to verify correctness
|
||||
Grid : Message : 101.462412 s : Called DwDag
|
||||
Grid : Message : 101.462413 s : norm dag result 12.0421
|
||||
Grid : Message : 101.474097 s : norm dag ref 12.0421
|
||||
Grid : Message : 101.489396 s : norm dag diff 7.63236e-14
|
||||
Grid : Message : 101.529094 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||
Grid : Message : 101.996820 s : src_e0.499997
|
||||
Grid : Message : 102.626690 s : src_o0.500003
|
||||
Grid : Message : 102.125734 s : *********************************************************
|
||||
Grid : Message : 102.125736 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||
Grid : Message : 102.125737 s : * Vectorising space-time by 8
|
||||
Grid : Message : 102.125738 s : * SINGLE precision
|
||||
Grid : Message : 102.125739 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 102.125739 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 102.125739 s : *********************************************************
|
||||
Grid : Message : 143.296910 s : Deo mflop/s = 1.30119e+07
|
||||
Grid : Message : 143.297140 s : Deo mflop/s per rank 813244
|
||||
Grid : Message : 143.297160 s : Deo mflop/s per node 3.25297e+06
|
||||
Grid : Message : 143.297180 s : #### Dhop calls report
|
||||
Grid : Message : 143.297190 s : WilsonFermion5D Number of DhopEO Calls : 3001
|
||||
Grid : Message : 143.297200 s : WilsonFermion5D TotalTime /Calls : 13630 us
|
||||
Grid : Message : 143.297210 s : WilsonFermion5D CommTime /Calls : 12124.9 us
|
||||
Grid : Message : 143.297220 s : WilsonFermion5D FaceTime /Calls : 590.958 us
|
||||
Grid : Message : 143.297230 s : WilsonFermion5D ComputeTime1/Calls : 43.2806 us
|
||||
Grid : Message : 143.297240 s : WilsonFermion5D ComputeTime2/Calls : 921.187 us
|
||||
Grid : Message : 143.297460 s : Average mflops/s per call : 4.24329e+09
|
||||
Grid : Message : 143.297470 s : Average mflops/s per call per rank : 2.65206e+08
|
||||
Grid : Message : 143.297480 s : Average mflops/s per call per node : 1.06082e+09
|
||||
Grid : Message : 143.297490 s : Average mflops/s per call (full) : 1.32347e+07
|
||||
Grid : Message : 143.297500 s : Average mflops/s per call per rank (full): 827169
|
||||
Grid : Message : 143.297510 s : Average mflops/s per call per node (full): 3.30868e+06
|
||||
Grid : Message : 143.297520 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 143.297520 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 143.297520 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 143.297520 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 143.297520 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 143.297520 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 143.112368 s : r_e6.02111
|
||||
Grid : Message : 143.119760 s : r_o6.02102
|
||||
Grid : Message : 143.126239 s : res12.0421
|
||||
Grid : Message : 143.720780 s : norm diff 0
|
||||
Grid : Message : 144.885380 s : norm diff even 0
|
||||
Grid : Message : 144.154396 s : norm diff odd 0
|
156
systems/Perlmutter/dwf.64.64.64.64.4node.opt1
Normal file
156
systems/Perlmutter/dwf.64.64.64.64.4node.opt1
Normal file
@ -0,0 +1,156 @@
|
||||
SLURM detected
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device Number : 0
|
||||
AcceleratorCudaInit[0]: ========================
|
||||
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
|
||||
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
|
||||
AcceleratorCudaInit[0]: managedMemory: 1
|
||||
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||
AcceleratorCudaInit[0]: warpSize: 32
|
||||
AcceleratorCudaInit[0]: pciBusID: 2
|
||||
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||
AcceleratorCudaInit: using default device
|
||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
|
||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||
AcceleratorCudaInit: ================================================
|
||||
SharedMemoryMpi: World communicator of size 16
|
||||
SharedMemoryMpi: Node communicator of size 4
|
||||
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7f4b80000000 for comms buffers
|
||||
Setting up IPC
|
||||
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|_ | | | | | | | | | | | | _|__
|
||||
__|_ _|__
|
||||
__|_ GGGG RRRR III DDDD _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G R R I D D _|__
|
||||
__|_ G GG RRRR I D D _|__
|
||||
__|_ G G R R I D D _|__
|
||||
__|_ GGGG R R III DDDD _|__
|
||||
__|_ _|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||
| | | | | | | | | | | | | |
|
||||
|
||||
|
||||
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes
|
||||
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : MPI is initialised and logging filters activated
|
||||
Grid : Message : ================================================
|
||||
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||
Grid : Message : MemoryManager Cache 34005057536 bytes
|
||||
Grid : Message : MemoryManager::Init() setting up
|
||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
|
||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||
Grid : Message : 0.648397 s : Grid Layout
|
||||
Grid : Message : 0.648398 s : Global lattice size : 64 64 64 64
|
||||
Grid : Message : 0.648401 s : OpenMP threads : 32
|
||||
Grid : Message : 0.648402 s : MPI tasks : 2 2 2 2
|
||||
Grid : Message : 0.663662 s : Making s innermost grids
|
||||
Grid : Message : 0.682145 s : Initialising 4d RNG
|
||||
Grid : Message : 0.754321 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||
Grid : Message : 0.754332 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||
Grid : Message : 0.863265 s : Initialising 5d RNG
|
||||
Grid : Message : 1.967677 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||
Grid : Message : 1.967691 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||
Grid : Message : 2.921676 s : Initialised RNGs
|
||||
Grid : Message : 4.382384 s : Drawing gauge field
|
||||
Grid : Message : 4.672590 s : Random gauge initialised
|
||||
Grid : Message : 6.102697 s : Setting up Cshift based reference
|
||||
Grid : Message : 7.185897 s : *****************************************************************
|
||||
Grid : Message : 7.185906 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||
Grid : Message : 7.185907 s : *****************************************************************
|
||||
Grid : Message : 7.185907 s : *****************************************************************
|
||||
Grid : Message : 7.185907 s : * Benchmarking DomainWallFermionR::Dhop
|
||||
Grid : Message : 7.185907 s : * Vectorising space-time by 8
|
||||
Grid : Message : 7.185908 s : * VComplexF size is 64 B
|
||||
Grid : Message : 7.185909 s : * SINGLE precision
|
||||
Grid : Message : 7.185909 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 7.185909 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 7.185909 s : *****************************************************************
|
||||
Grid : Message : 8.114241 s : Called warmup
|
||||
Grid : Message : 83.988100 s : Called Dw 3000 times in 7.48954e+07 us
|
||||
Grid : Message : 83.992400 s : mflop/s = 1.41932e+07
|
||||
Grid : Message : 83.992600 s : mflop/s per rank = 887074
|
||||
Grid : Message : 83.992700 s : mflop/s per node = 3.5483e+06
|
||||
Grid : Message : 83.992800 s : RF GiB/s (base 2) = 28840.2
|
||||
Grid : Message : 83.992900 s : mem GiB/s (base 2) = 18025.1
|
||||
Grid : Message : 83.134870 s : norm diff 1.03481e-13
|
||||
Grid : Message : 83.493960 s : #### Dhop calls report
|
||||
Grid : Message : 83.494000 s : WilsonFermion5D Number of DhopEO Calls : 6002
|
||||
Grid : Message : 83.494030 s : WilsonFermion5D TotalTime /Calls : 12506 us
|
||||
Grid : Message : 83.494040 s : WilsonFermion5D CommTime /Calls : 11071.5 us
|
||||
Grid : Message : 83.494050 s : WilsonFermion5D FaceTime /Calls : 530.971 us
|
||||
Grid : Message : 83.494060 s : WilsonFermion5D ComputeTime1/Calls : 23.6428 us
|
||||
Grid : Message : 83.494070 s : WilsonFermion5D ComputeTime2/Calls : 911.864 us
|
||||
Grid : Message : 83.494220 s : Average mflops/s per call : 7.6108e+09
|
||||
Grid : Message : 83.494250 s : Average mflops/s per call per rank : 4.75675e+08
|
||||
Grid : Message : 83.494260 s : Average mflops/s per call per node : 1.9027e+09
|
||||
Grid : Message : 83.494270 s : Average mflops/s per call (full) : 1.44242e+07
|
||||
Grid : Message : 83.494280 s : Average mflops/s per call per rank (full): 901513
|
||||
Grid : Message : 83.494290 s : Average mflops/s per call per node (full): 3.60605e+06
|
||||
Grid : Message : 83.494300 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 83.494300 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 83.494300 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 83.494300 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 83.494300 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 83.494300 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 94.600488 s : Compare to naive wilson implementation Dag to verify correctness
|
||||
Grid : Message : 94.600501 s : Called DwDag
|
||||
Grid : Message : 94.600502 s : norm dag result 12.0421
|
||||
Grid : Message : 94.613445 s : norm dag ref 12.0421
|
||||
Grid : Message : 94.628514 s : norm dag diff 7.63236e-14
|
||||
Grid : Message : 94.666370 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||
Grid : Message : 95.136361 s : src_e0.499997
|
||||
Grid : Message : 95.208108 s : src_o0.500003
|
||||
Grid : Message : 95.271511 s : *********************************************************
|
||||
Grid : Message : 95.271512 s : * Benchmarking DomainWallFermionF::DhopEO
|
||||
Grid : Message : 95.271513 s : * Vectorising space-time by 8
|
||||
Grid : Message : 95.271514 s : * SINGLE precision
|
||||
Grid : Message : 95.271514 s : * Using Overlapped Comms/Compute
|
||||
Grid : Message : 95.271515 s : * Using GENERIC Nc WilsonKernels
|
||||
Grid : Message : 95.271515 s : *********************************************************
|
||||
Grid : Message : 132.766274 s : Deo mflop/s = 1.41952e+07
|
||||
Grid : Message : 132.766295 s : Deo mflop/s per rank 887201
|
||||
Grid : Message : 132.766297 s : Deo mflop/s per node 3.5488e+06
|
||||
Grid : Message : 132.766299 s : #### Dhop calls report
|
||||
Grid : Message : 132.766300 s : WilsonFermion5D Number of DhopEO Calls : 3001
|
||||
Grid : Message : 132.766301 s : WilsonFermion5D TotalTime /Calls : 12493.9 us
|
||||
Grid : Message : 132.766302 s : WilsonFermion5D CommTime /Calls : 10990.2 us
|
||||
Grid : Message : 132.766303 s : WilsonFermion5D FaceTime /Calls : 604.889 us
|
||||
Grid : Message : 132.766304 s : WilsonFermion5D ComputeTime1/Calls : 13.7158 us
|
||||
Grid : Message : 132.766305 s : WilsonFermion5D ComputeTime2/Calls : 920.096 us
|
||||
Grid : Message : 132.766326 s : Average mflops/s per call : 1.31121e+10
|
||||
Grid : Message : 132.766328 s : Average mflops/s per call per rank : 8.19504e+08
|
||||
Grid : Message : 132.766329 s : Average mflops/s per call per node : 3.27802e+09
|
||||
Grid : Message : 132.766330 s : Average mflops/s per call (full) : 1.44381e+07
|
||||
Grid : Message : 132.766331 s : Average mflops/s per call per rank (full): 902382
|
||||
Grid : Message : 132.766332 s : Average mflops/s per call per node (full): 3.60953e+06
|
||||
Grid : Message : 132.766333 s : WilsonFermion5D Stencil
|
||||
Grid : Message : 132.766333 s : WilsonFermion5D StencilEven
|
||||
Grid : Message : 132.766333 s : WilsonFermion5D StencilOdd
|
||||
Grid : Message : 132.766333 s : WilsonFermion5D Stencil Reporti()
|
||||
Grid : Message : 132.766333 s : WilsonFermion5D StencilEven Reporti()
|
||||
Grid : Message : 132.766333 s : WilsonFermion5D StencilOdd Reporti()
|
||||
Grid : Message : 132.847999 s : r_e6.02111
|
||||
Grid : Message : 132.854237 s : r_o6.02102
|
||||
Grid : Message : 132.860309 s : res12.0421
|
||||
Grid : Message : 133.458462 s : norm diff 0
|
||||
Grid : Message : 133.832713 s : norm diff even 0
|
||||
Grid : Message : 133.909147 s : norm diff odd 0
|
24
systems/Perlmutter/dwf4.slurm
Normal file
24
systems/Perlmutter/dwf4.slurm
Normal file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
|
||||
#SBATCH -A mp13
|
||||
#SBATCH -C gpu
|
||||
#SBATCH -q regular
|
||||
#SBATCH -t 0:20:00
|
||||
#SBATCH -n 16
|
||||
#SBATCH --ntasks-per-node=4
|
||||
#SBATCH -c 32
|
||||
#SBATCH --exclusive
|
||||
#SBATCH --gpus-per-task=1
|
||||
#SBATCH --gpu-bind=map_gpu:0,1,2,3
|
||||
|
||||
export SLURM_CPU_BIND="cores"
|
||||
export MPICH_RDMA_ENABLED_CUDA=1
|
||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||
srun ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2 --accelerator-threads 8 > comms.4node
|
||||
|
||||
OPT="--comms-overlap --comms-concurrent --shm-mpi 0"
|
||||
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0
|
||||
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0
|
||||
|
||||
OPT="--comms-overlap --comms-concurrent --shm-mpi 1"
|
||||
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1
|
||||
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1
|
4
systems/Perlmutter/sourceme.sh
Normal file
4
systems/Perlmutter/sourceme.sh
Normal file
@ -0,0 +1,4 @@
|
||||
|
||||
export CRAY_ACCEL_TARGET=nvidia80
|
||||
|
||||
module load PrgEnv-gnu cpe-cuda cuda
|
Loading…
Reference in New Issue
Block a user