mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Dirichlet first cut - wrong answers on dagger multiply.
Struggling to get a compute node so changing systems
This commit is contained in:
		@@ -1,25 +1,25 @@
 | 
			
		||||
tu-c0r0n00 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r0n00 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r0n09 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r0n00 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r0n06 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r0n06 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r0n09 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r0n09 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r0n03 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r0n06 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r0n09 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r0n00 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r0n03 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r0n03 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r0n06 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r0n03 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r3n00 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r3n00 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r3n00 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r3n00 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r3n06 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r3n06 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r3n06 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r3n06 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r3n03 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r3n03 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r3n03 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r3n03 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
tu-c0r3n09 - 0 device=0 binding=--interleave=0,1
 | 
			
		||||
tu-c0r3n09 - 1 device=1 binding=--interleave=2,3
 | 
			
		||||
tu-c0r3n09 - 2 device=2 binding=--interleave=4,5
 | 
			
		||||
tu-c0r3n09 - 3 device=3 binding=--interleave=6,7
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit[0]: ========================
 | 
			
		||||
AcceleratorCudaInit[0]: Device Number    : 0
 | 
			
		||||
@@ -33,11 +33,41 @@ AcceleratorCudaInit[0]:   pciBusID: 3
 | 
			
		||||
AcceleratorCudaInit[0]:   pciDeviceID: 0 
 | 
			
		||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
AcceleratorCudaInit[0]: ========================
 | 
			
		||||
AcceleratorCudaInit[0]: Device Number    : 0
 | 
			
		||||
AcceleratorCudaInit[0]: ========================
 | 
			
		||||
@@ -50,43 +80,25 @@ AcceleratorCudaInit[0]:   pciBusID: 3
 | 
			
		||||
AcceleratorCudaInit[0]:   pciDeviceID: 0 
 | 
			
		||||
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses
 | 
			
		||||
AcceleratorCudaInit: a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-setdevice=no 
 | 
			
		||||
local rank 1 device 0 bus id: 0000:44:00.0
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
local rank 0 device 0 bus id: 0000:03:00.0
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
OPENMPI detected
 | 
			
		||||
AcceleratorCudaInit: using default device 
 | 
			
		||||
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or 
 | 
			
		||||
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
 | 
			
		||||
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no 
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
local rank 0 device 0 bus id: 0000:03:00.0
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
AcceleratorCudaInit: ================================================
 | 
			
		||||
local rank 2 device 0 bus id: 0000:84:00.0
 | 
			
		||||
SharedMemoryMpi:  World communicator of size 16
 | 
			
		||||
SharedMemoryMpi:  Node  communicator of size 4
 | 
			
		||||
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fcd80000000 for comms buffers 
 | 
			
		||||
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x153960000000 for comms buffers 
 | 
			
		||||
Setting up IPC
 | 
			
		||||
 | 
			
		||||
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
 | 
			
		||||
@@ -116,7 +128,7 @@ This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
Current Grid git commit hash=9d2238148c56e3fbadfa95dcabf2b83d4bde14cd: (HEAD -> develop) uncommited changes
 | 
			
		||||
Current Grid git commit hash=da06d15f73184ceb15d66d4e7e702b02fed7b940: (HEAD -> feature/dirichlet, develop) uncommited changes
 | 
			
		||||
 | 
			
		||||
Grid : Message : ================================================ 
 | 
			
		||||
Grid : Message : MPI is initialised and logging filters activated 
 | 
			
		||||
@@ -124,122 +136,102 @@ Grid : Message : ================================================
 | 
			
		||||
Grid : Message : Requested 2147483648 byte stencil comms buffers 
 | 
			
		||||
Grid : Message : MemoryManager Cache 34004218675 bytes 
 | 
			
		||||
Grid : Message : MemoryManager::Init() setting up
 | 
			
		||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
 | 
			
		||||
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
 | 
			
		||||
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
 | 
			
		||||
Grid : Message : MemoryManager::Init() Using cudaMalloc
 | 
			
		||||
Grid : Message : 1.198523 s : Grid Layout
 | 
			
		||||
Grid : Message : 1.198530 s : 	Global lattice size  : 64 64 64 64 
 | 
			
		||||
Grid : Message : 1.198534 s : 	OpenMP threads       : 4
 | 
			
		||||
Grid : Message : 1.198535 s : 	MPI tasks            : 2 2 2 2 
 | 
			
		||||
Grid : Message : 1.397615 s : Making s innermost grids
 | 
			
		||||
Grid : Message : 1.441828 s : Initialising 4d RNG
 | 
			
		||||
Grid : Message : 1.547973 s : Intialising parallel RNG with unique string 'The 4D RNG'
 | 
			
		||||
Grid : Message : 1.547998 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 | 
			
		||||
Grid : Message : 1.954777 s : Initialising 5d RNG
 | 
			
		||||
Grid : Message : 3.633825 s : Intialising parallel RNG with unique string 'The 5D RNG'
 | 
			
		||||
Grid : Message : 3.633869 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 | 
			
		||||
Grid : Message : 12.162710 s : Initialised RNGs
 | 
			
		||||
Grid : Message : 15.882520 s : Drawing gauge field
 | 
			
		||||
Grid : Message : 15.816362 s : Random gauge initialised 
 | 
			
		||||
Grid : Message : 17.279671 s : Setting up Cshift based reference 
 | 
			
		||||
Grid : Message : 26.331426 s : *****************************************************************
 | 
			
		||||
Grid : Message : 26.331452 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 | 
			
		||||
Grid : Message : 26.331454 s : *****************************************************************
 | 
			
		||||
Grid : Message : 26.331456 s : *****************************************************************
 | 
			
		||||
Grid : Message : 26.331458 s : * Benchmarking DomainWallFermionR::Dhop                  
 | 
			
		||||
Grid : Message : 26.331459 s : * Vectorising space-time by 8
 | 
			
		||||
Grid : Message : 26.331463 s : * VComplexF size is 64 B
 | 
			
		||||
Grid : Message : 26.331465 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 26.331467 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 26.331468 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 26.331469 s : *****************************************************************
 | 
			
		||||
Grid : Message : 28.413717 s : Called warmup
 | 
			
		||||
Grid : Message : 56.418423 s : Called Dw 3000 times in 2.80047e+07 us
 | 
			
		||||
Grid : Message : 56.418476 s : mflop/s =   3.79581e+07
 | 
			
		||||
Grid : Message : 56.418479 s : mflop/s per rank =  2.37238e+06
 | 
			
		||||
Grid : Message : 56.418481 s : mflop/s per node =  9.48953e+06
 | 
			
		||||
Grid : Message : 56.418483 s : RF  GiB/s (base 2) =   77130
 | 
			
		||||
Grid : Message : 56.418485 s : mem GiB/s (base 2) =   48206.3
 | 
			
		||||
Grid : Message : 56.422076 s : norm diff   1.03481e-13
 | 
			
		||||
Grid : Message : 56.456894 s : #### Dhop calls report 
 | 
			
		||||
Grid : Message : 56.456899 s : WilsonFermion5D Number of DhopEO Calls   : 6002
 | 
			
		||||
Grid : Message : 56.456903 s : WilsonFermion5D TotalTime   /Calls        : 4710.93 us
 | 
			
		||||
Grid : Message : 56.456905 s : WilsonFermion5D CommTime    /Calls        : 3196.15 us
 | 
			
		||||
Grid : Message : 56.456908 s : WilsonFermion5D FaceTime    /Calls        : 494.392 us
 | 
			
		||||
Grid : Message : 56.456910 s : WilsonFermion5D ComputeTime1/Calls        : 44.4107 us
 | 
			
		||||
Grid : Message : 56.456912 s : WilsonFermion5D ComputeTime2/Calls        : 1037.75 us
 | 
			
		||||
Grid : Message : 56.456921 s : Average mflops/s per call                : 3.55691e+09
 | 
			
		||||
Grid : Message : 56.456925 s : Average mflops/s per call per rank       : 2.22307e+08
 | 
			
		||||
Grid : Message : 56.456928 s : Average mflops/s per call per node       : 8.89228e+08
 | 
			
		||||
Grid : Message : 56.456930 s : Average mflops/s per call (full)         : 3.82915e+07
 | 
			
		||||
Grid : Message : 56.456933 s : Average mflops/s per call per rank (full): 2.39322e+06
 | 
			
		||||
Grid : Message : 56.456952 s : Average mflops/s per call per node (full): 9.57287e+06
 | 
			
		||||
Grid : Message : 56.456954 s : WilsonFermion5D Stencil
 | 
			
		||||
Grid : Message : 56.457016 s :  Stencil calls 3001
 | 
			
		||||
Grid : Message : 56.457022 s :  Stencil halogtime 0
 | 
			
		||||
Grid : Message : 56.457024 s :  Stencil gathertime 55.9154
 | 
			
		||||
Grid : Message : 56.457026 s :  Stencil gathermtime 20.1073
 | 
			
		||||
Grid : Message : 56.457028 s :  Stencil mergetime 18.5585
 | 
			
		||||
Grid : Message : 56.457030 s :  Stencil decompresstime 0.0639787
 | 
			
		||||
Grid : Message : 56.457032 s :  Stencil comms_bytes 4.02653e+08
 | 
			
		||||
Grid : Message : 56.457034 s :  Stencil commtime 6379.93
 | 
			
		||||
Grid : Message : 56.457036 s :  Stencil 63.1124 GB/s per rank
 | 
			
		||||
Grid : Message : 56.457038 s :  Stencil 252.45 GB/s per node
 | 
			
		||||
Grid : Message : 56.457040 s : WilsonFermion5D StencilEven
 | 
			
		||||
Grid : Message : 56.457048 s : WilsonFermion5D StencilOdd
 | 
			
		||||
Grid : Message : 56.457062 s : WilsonFermion5D Stencil     Reporti()
 | 
			
		||||
Grid : Message : 56.457065 s : WilsonFermion5D StencilEven Reporti()
 | 
			
		||||
Grid : Message : 56.457066 s : WilsonFermion5D StencilOdd  Reporti()
 | 
			
		||||
Grid : Message : 79.259261 s : Compare to naive wilson implementation Dag to verify correctness
 | 
			
		||||
Grid : Message : 79.259287 s : Called DwDag
 | 
			
		||||
Grid : Message : 79.259288 s : norm dag result 12.0421
 | 
			
		||||
Grid : Message : 79.271740 s : norm dag ref    12.0421
 | 
			
		||||
Grid : Message : 79.287759 s : norm dag diff   7.63236e-14
 | 
			
		||||
Grid : Message : 79.328100 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 | 
			
		||||
Grid : Message : 79.955951 s : src_e0.499997
 | 
			
		||||
Grid : Message : 80.633620 s : src_o0.500003
 | 
			
		||||
Grid : Message : 80.164163 s : *********************************************************
 | 
			
		||||
Grid : Message : 80.164168 s : * Benchmarking DomainWallFermionF::DhopEO                
 | 
			
		||||
Grid : Message : 80.164170 s : * Vectorising space-time by 8
 | 
			
		||||
Grid : Message : 80.164172 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 80.164174 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 80.164177 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 80.164178 s : *********************************************************
 | 
			
		||||
Grid : Message : 93.797635 s : Deo mflop/s =   3.93231e+07
 | 
			
		||||
Grid : Message : 93.797670 s : Deo mflop/s per rank   2.45769e+06
 | 
			
		||||
Grid : Message : 93.797672 s : Deo mflop/s per node   9.83077e+06
 | 
			
		||||
Grid : Message : 93.797674 s : #### Dhop calls report 
 | 
			
		||||
Grid : Message : 93.797675 s : WilsonFermion5D Number of DhopEO Calls   : 3001
 | 
			
		||||
Grid : Message : 93.797677 s : WilsonFermion5D TotalTime   /Calls        : 4542.83 us
 | 
			
		||||
Grid : Message : 93.797679 s : WilsonFermion5D CommTime    /Calls        : 2978.97 us
 | 
			
		||||
Grid : Message : 93.797681 s : WilsonFermion5D FaceTime    /Calls        : 602.287 us
 | 
			
		||||
Grid : Message : 93.797683 s : WilsonFermion5D ComputeTime1/Calls        : 67.1416 us
 | 
			
		||||
Grid : Message : 93.797685 s : WilsonFermion5D ComputeTime2/Calls        : 1004.07 us
 | 
			
		||||
Grid : Message : 93.797713 s : Average mflops/s per call                : 3.30731e+09
 | 
			
		||||
Grid : Message : 93.797717 s : Average mflops/s per call per rank       : 2.06707e+08
 | 
			
		||||
Grid : Message : 93.797719 s : Average mflops/s per call per node       : 8.26827e+08
 | 
			
		||||
Grid : Message : 93.797721 s : Average mflops/s per call (full)         : 3.97084e+07
 | 
			
		||||
Grid : Message : 93.797727 s : Average mflops/s per call per rank (full): 2.48178e+06
 | 
			
		||||
Grid : Message : 93.797732 s : Average mflops/s per call per node (full): 9.92711e+06
 | 
			
		||||
Grid : Message : 93.797735 s : WilsonFermion5D Stencil
 | 
			
		||||
Grid : Message : 93.797746 s : WilsonFermion5D StencilEven
 | 
			
		||||
Grid : Message : 93.797758 s : WilsonFermion5D StencilOdd
 | 
			
		||||
Grid : Message : 93.797769 s :  Stencil calls 3001
 | 
			
		||||
Grid : Message : 93.797773 s :  Stencil halogtime 0
 | 
			
		||||
Grid : Message : 93.797776 s :  Stencil gathertime 56.7458
 | 
			
		||||
Grid : Message : 93.797780 s :  Stencil gathermtime 22.6504
 | 
			
		||||
Grid : Message : 93.797782 s :  Stencil mergetime 21.1913
 | 
			
		||||
Grid : Message : 93.797786 s :  Stencil decompresstime 0.0556481
 | 
			
		||||
Grid : Message : 93.797788 s :  Stencil comms_bytes 2.01327e+08
 | 
			
		||||
Grid : Message : 93.797791 s :  Stencil commtime 2989.33
 | 
			
		||||
Grid : Message : 93.797795 s :  Stencil 67.3484 GB/s per rank
 | 
			
		||||
Grid : Message : 93.797798 s :  Stencil 269.394 GB/s per node
 | 
			
		||||
Grid : Message : 93.797801 s : WilsonFermion5D Stencil     Reporti()
 | 
			
		||||
Grid : Message : 93.797803 s : WilsonFermion5D StencilEven Reporti()
 | 
			
		||||
Grid : Message : 93.797805 s : WilsonFermion5D StencilOdd  Reporti()
 | 
			
		||||
Grid : Message : 93.873429 s : r_e6.02111
 | 
			
		||||
Grid : Message : 93.879931 s : r_o6.02102
 | 
			
		||||
Grid : Message : 93.885912 s : res12.0421
 | 
			
		||||
Grid : Message : 94.876555 s : norm diff   0
 | 
			
		||||
Grid : Message : 95.485643 s : norm diff even  0
 | 
			
		||||
Grid : Message : 95.581236 s : norm diff odd   0
 | 
			
		||||
Grid : Message : 1.875883 s : Grid Layout
 | 
			
		||||
Grid : Message : 1.875893 s : 	Global lattice size  : 64 64 64 64 
 | 
			
		||||
Grid : Message : 1.875897 s : 	OpenMP threads       : 4
 | 
			
		||||
Grid : Message : 1.875898 s : 	MPI tasks            : 2 2 2 2 
 | 
			
		||||
Grid : Message : 1.993571 s : Initialising 4d RNG
 | 
			
		||||
Grid : Message : 2.881990 s : Intialising parallel RNG with unique string 'The 4D RNG'
 | 
			
		||||
Grid : Message : 2.882370 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 | 
			
		||||
Grid : Message : 2.495044 s : Initialising 5d RNG
 | 
			
		||||
Grid : Message : 4.120900 s : Intialising parallel RNG with unique string 'The 5D RNG'
 | 
			
		||||
Grid : Message : 4.121350 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 | 
			
		||||
Grid : Message : 15.268010 s : Drawing gauge field
 | 
			
		||||
Grid : Message : 16.234025 s : Random gauge initialised 
 | 
			
		||||
Grid : Message : 16.234057 s : Applying BCs 
 | 
			
		||||
Grid : Message : 16.365565 s : Setting up Cshift based reference 
 | 
			
		||||
Grid : Message : 44.512418 s : *****************************************************************
 | 
			
		||||
Grid : Message : 44.512448 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 | 
			
		||||
Grid : Message : 44.512450 s : *****************************************************************
 | 
			
		||||
Grid : Message : 44.512451 s : *****************************************************************
 | 
			
		||||
Grid : Message : 44.512452 s : * Benchmarking DomainWallFermionR::Dhop                  
 | 
			
		||||
Grid : Message : 44.512453 s : * Vectorising space-time by 8
 | 
			
		||||
Grid : Message : 44.512454 s : * VComplexF size is 64 B
 | 
			
		||||
Grid : Message : 44.512456 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 44.512459 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 44.512460 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 44.512461 s : *****************************************************************
 | 
			
		||||
Grid : Message : 46.389070 s : Called warmup
 | 
			
		||||
Grid : Message : 49.211265 s : Called Dw 300 times in 2.82203e+06 us
 | 
			
		||||
Grid : Message : 49.211295 s : mflop/s =   3.76681e+07
 | 
			
		||||
Grid : Message : 49.211297 s : mflop/s per rank =  2.35425e+06
 | 
			
		||||
Grid : Message : 49.211299 s : mflop/s per node =  9.41702e+06
 | 
			
		||||
Grid : Message : 49.211301 s : RF  GiB/s (base 2) =   76540.6
 | 
			
		||||
Grid : Message : 49.211308 s : mem GiB/s (base 2) =   47837.9
 | 
			
		||||
Grid : Message : 49.214868 s : norm diff   1.06409e-13
 | 
			
		||||
Grid : Message : 92.647781 s : Compare to naive wilson implementation Dag to verify correctness
 | 
			
		||||
Grid : Message : 92.647816 s : Called DwDag
 | 
			
		||||
Grid : Message : 92.647817 s : norm dag result 12.0421
 | 
			
		||||
Grid : Message : 92.801806 s : norm dag ref    12.0421
 | 
			
		||||
Grid : Message : 92.817724 s : norm dag diff   7.21921e-14
 | 
			
		||||
Grid : Message : 92.858973 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
 | 
			
		||||
Grid : Message : 93.210378 s : src_e0.499997
 | 
			
		||||
Grid : Message : 93.583286 s : src_o0.500003
 | 
			
		||||
Grid : Message : 93.682468 s : *********************************************************
 | 
			
		||||
Grid : Message : 93.682471 s : * Benchmarking DomainWallFermionF::DhopEO                
 | 
			
		||||
Grid : Message : 93.682472 s : * Vectorising space-time by 8
 | 
			
		||||
Grid : Message : 93.682473 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 93.682475 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 93.682476 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 93.682477 s : *********************************************************
 | 
			
		||||
Grid : Message : 95.162342 s : Deo mflop/s =   3.92487e+07
 | 
			
		||||
Grid : Message : 95.162387 s : Deo mflop/s per rank   2.45305e+06
 | 
			
		||||
Grid : Message : 95.162389 s : Deo mflop/s per node   9.81219e+06
 | 
			
		||||
Grid : Message : 95.232801 s : r_e6.02111
 | 
			
		||||
Grid : Message : 95.240061 s : r_o6.02102
 | 
			
		||||
Grid : Message : 95.245975 s : res12.0421
 | 
			
		||||
Grid : Message : 95.833402 s : norm diff   0
 | 
			
		||||
Grid : Message : 96.573829 s : norm diff even  0
 | 
			
		||||
Grid : Message : 96.868272 s : norm diff odd   0
 | 
			
		||||
 Dirichlet block [0 64 64 32 32]
 | 
			
		||||
Grid : Message : 97.756909 s : Grid Layout
 | 
			
		||||
Grid : Message : 97.756911 s : 	Global lattice size  : 64 64 64 64 
 | 
			
		||||
Grid : Message : 97.756921 s : 	OpenMP threads       : 4
 | 
			
		||||
Grid : Message : 97.756922 s : 	MPI tasks            : 2 2 2 2 
 | 
			
		||||
Grid : Message : 97.897085 s : Initialising 4d RNG
 | 
			
		||||
Grid : Message : 97.965061 s : Intialising parallel RNG with unique string 'The 4D RNG'
 | 
			
		||||
Grid : Message : 97.965097 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
 | 
			
		||||
Grid : Message : 98.367431 s : Initialising 5d RNG
 | 
			
		||||
Grid : Message : 99.752745 s : Intialising parallel RNG with unique string 'The 5D RNG'
 | 
			
		||||
Grid : Message : 99.752790 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
 | 
			
		||||
Grid : Message : 111.290148 s : Drawing gauge field
 | 
			
		||||
Grid : Message : 112.349289 s : Random gauge initialised 
 | 
			
		||||
Grid : Message : 112.349320 s : Applying BCs 
 | 
			
		||||
Grid : Message : 113.948740 s : Setting up Cshift based reference 
 | 
			
		||||
Grid : Message : 140.320415 s : *****************************************************************
 | 
			
		||||
Grid : Message : 140.320443 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
 | 
			
		||||
Grid : Message : 140.320444 s : *****************************************************************
 | 
			
		||||
Grid : Message : 140.320445 s : *****************************************************************
 | 
			
		||||
Grid : Message : 140.320446 s : * Benchmarking DomainWallFermionR::Dhop                  
 | 
			
		||||
Grid : Message : 140.320447 s : * Vectorising space-time by 8
 | 
			
		||||
Grid : Message : 140.320448 s : * VComplexF size is 64 B
 | 
			
		||||
Grid : Message : 140.320450 s : * SINGLE precision 
 | 
			
		||||
Grid : Message : 140.320451 s : * Using Overlapped Comms/Compute
 | 
			
		||||
Grid : Message : 140.320452 s : * Using GENERIC Nc WilsonKernels
 | 
			
		||||
Grid : Message : 140.320453 s : *****************************************************************
 | 
			
		||||
Grid : Message : 142.296150 s : Called warmup
 | 
			
		||||
Grid : Message : 144.397678 s : Called Dw 300 times in 2.36719e+06 us
 | 
			
		||||
Grid : Message : 144.397700 s : mflop/s =   4.49058e+07
 | 
			
		||||
Grid : Message : 144.397702 s : mflop/s per rank =  2.80661e+06
 | 
			
		||||
Grid : Message : 144.397704 s : mflop/s per node =  1.12265e+07
 | 
			
		||||
Grid : Message : 144.397706 s : RF  GiB/s (base 2) =   91247.6
 | 
			
		||||
Grid : Message : 144.397708 s : mem GiB/s (base 2) =   57029.7
 | 
			
		||||
Grid : Message : 144.401269 s : norm diff   9.78944e-14
 | 
			
		||||
Grid : Message : 186.885460 s : Compare to naive wilson implementation Dag to verify correctness
 | 
			
		||||
Grid : Message : 186.885492 s : Called DwDag
 | 
			
		||||
Grid : Message : 186.885493 s : norm dag result 10.4157
 | 
			
		||||
Grid : Message : 186.897154 s : norm dag ref    11.2266
 | 
			
		||||
Grid : Message : 186.912538 s : norm dag diff   0.484633
 | 
			
		||||
 
 | 
			
		||||
@@ -1,14 +1,13 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
#SBATCH -J dslash
 | 
			
		||||
#SBATCH -A tc002
 | 
			
		||||
#SBATCH -t 2:20:00
 | 
			
		||||
#SBATCH --nodelist=tu-c0r0n[00,03,06,09]
 | 
			
		||||
#SBATCH -A dp207
 | 
			
		||||
#SBATCH --exclusive
 | 
			
		||||
#SBATCH --nodes=4
 | 
			
		||||
#SBATCH --ntasks=16
 | 
			
		||||
#SBATCH --qos=standard
 | 
			
		||||
#SBATCH --ntasks-per-node=4
 | 
			
		||||
#SBATCH --cpus-per-task=8
 | 
			
		||||
#SBATCH --time=12:00:00
 | 
			
		||||
#SBATCH --time=0:05:00
 | 
			
		||||
#SBATCH --partition=gpu
 | 
			
		||||
#SBATCH --gres=gpu:4
 | 
			
		||||
#SBATCH --output=%x.%j.out
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user