1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-06-07 12:44:37 +01:00

Compare commits

..

166 Commits

Author SHA1 Message Date
Peter Boyle 4aa0bca4dc Change sum operation to use gpucub mistake in PR from Chris
Updated the sum operation definition for GPU reduction to use gpucub instead of cub.
2026-06-01 14:12:25 -04:00
Peter Boyle 905da6f083 Merge branch 'feature/reduction-reorganisation' into develop 2026-05-27 21:01:30 -04:00
Peter Boyle 86c7f29183 Config command update 2026-05-27 16:19:33 -04:00
Peter Boyle b0c99f876e Configure on mac update 2026-05-27 16:16:55 -04:00
Peter Boyle bf5fcdc860 Ease of use for std::complex interchangable with thrust 2026-05-27 16:05:37 -04:00
Peter Boyle b58a1508fa Perlmutter cuda version update 2026-05-21 13:25:13 -07:00
Peter Boyle 4d527e81fa Remove hip specific files 2026-05-21 12:34:30 -04:00
Peter Boyle 7803580aa6 Lattice_reduction_gpu: demote timing logs to Debug, disable by default
skills/mpi-heterogeneous: add Bug Class 4 for Frontier GTL/libamdhip64 ABI mismatch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 32654db366 Test_planned_fft: fix PlannedFFT template parameter to use ::vector_object
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle cd340cfab3 tests: add Test_planned_fft exercising PlannedFFT<vobj>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle f32866b2ff tests/fft: remove PlanDestroy calls (FFT handles plans per-call)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 1cd1dc091e FFT: add FFTbase, PlannedFFT; factor FFT_dim_execute free function
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 0493656e86 debug: add Test_hipfft_repro — reproducer for hipFFT PARSE_ERROR on ROCm 7
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 66fd504c4d tests/debug: add G=4 to hipfft fail reproducer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle be4dd2b52f tests/debug: test hipMemset variant before cache is populated
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 707d059766 tests/debug: extend hipfft fail reproducer with hipMemset and sync variants
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle f08c755ae6 FFT: use host stack buffer in PlanCreate, not deviceVector
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle dbbfdd4e4b tests/debug: add minimal hipfft ordering bug fail/pass pair
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle f967fb40bf tests/debug: test plan-before-malloc vs malloc-before-plan ordering
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 74e0f846cb tests/debug: extend hipfft reproducer with Grid-realistic howmany and exec tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 303a4d26e5 tests/debug: add minimal hipfft plan-creation reproducer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 119888653c FFT HIP: use hipfftCreate+hipfftMakePlanMany instead of hipfftPlanMany 2026-05-21 12:34:30 -04:00
Peter Boyle a9f42c08f9 FFT: pass nullptr for inembed/onembed in hipfftPlanMany to avoid HIPFFT_PARSE_ERROR 2026-05-21 12:34:30 -04:00
Peter Boyle e79adc9d31 FFT: cache plans per vobj type across calls
Plans are created lazily on the first FFT_dim call and reused for all
subsequent calls on the same FFT object.  PlanCreate<vobj>() can be
called explicitly to pre-warm the cache.  PlanDestroy() must be called
before switching to a different vobj type; the destructor cleans up any
live plans automatically.

Update Test_fft.cc and Test_fftf.cc to call PlanDestroy() between the
LatticeComplex and LatticeSpinMatrix sections that reuse the same FFT object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 5a9056cd93 Accelerator: lower default accelerator_threads from 16 to 8
Benchmark_dwf_fp32 on MI250X GCD: 1.7 TF/s at nt=8, ~300 GF/s at nt=16.
With Nsimd=8 (fp32, GEN_SIMD_WIDTH=64B), nt=8 gives exactly 64 threads =
one full AMD wavefront. Higher values double register demand per block and
hit a register-pressure cliff for stencil kernels.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 012c36ab5a Accelerator: raise default accelerator_threads from 2 to 16 2026-05-21 12:34:30 -04:00
Peter Boyle 5c4574f9aa skills: add gpu-memory-performance.md
Documents the acceleratorThreads() default=2 trap, LambdaApply thread
mapping, coalescedRead/Write idiom, when to use __global__ vs
accelerator_for, and fused vs staged HBM access patterns.

Includes observed MI250X numbers from LatticePropagatorD reduction
(50 → 297 → 546 GB/s progression).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle a424775884 sumD_gpu_reduce_words: fuse pack+reduce into single packReduceKernel
Replace the two-kernel pack+reduce sequence with a single fused kernel
packReduceKernel<R> that reads R words of each vobj at offset 'base'
and accumulates directly into iVector<iScalar<scalarD>,R>, eliminating
the intermediate bundle buffer entirely.

HBM access per word-group drops from 3x (pack-read + pack-write +
reduce-read) to 1x.  Thread count comes from getNumBlocksAndThreads
(warpSize..256) rather than acceleratorThreads(), so occupancy is
correct regardless of the --accelerator-threads setting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle d6b1388741 Modified repack 2026-05-21 12:34:30 -04:00
Peter Boyle 796c6cae4e Enable GRID_REDUCTION_TIMING unconditionally
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 1a8064d6d9 Lattice_reduction_gpu: add GRID_REDUCTION_TIMING instrumentation
Uncomment #define GRID_REDUCTION_TIMING to enable per-phase timing output:

  sumD_gpu_reduce_words: pack time (accelerator_for) per R and base
  sumD_gpu_small:        reduceKernel+barrier time and D2H time separately
  sumD_gpu_large:        total wall time across all word groups

This lets us identify whether the large-type bottleneck is in the pack
kernel, the shared-memory reduction kernel, the barrier, or the D2H.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 43648924c3 sumD_gpu_large: radix-12 word-bundle reduction replacing radix-1
Replace the word-by-word loop (one kernel launch per scalar word) with
sumD_gpu_reduce_words<R> which packs R consecutive vector_type words per
site into iVector<iScalar<vector>,R>, then calls the existing sumD_gpu_small
shared-memory kernel once for the whole bundle.

Dispatch: radix-12 first, radix-4 for the remainder < 12, radix-1 for
any final < 4 words.  For LatticePropagator (144 words = 12x12), this
reduces the kernel-launch count from 144 to 12 -- a 12x reduction.

Bundle::Nsimd() inherits from vector_type so sumD_gpu_small handles SIMD
lane extraction and double-precision promotion identically to the scalar
word case.  sizeof(Bundle::scalar_objectD) = R*16 <= 192 B; well within
sharedMemPerBlock on all supported devices.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle bf2140e74d Lattice_reduction_sycl: fix double-precision accumulation in sumD_gpu_tensor
Accumulate in sobjD throughout rather than accumulating in sobj and
converting the final sum. For float fields this matters: summing N floats
then casting loses O(N*eps_float) relative precision vs accumulating in
double from the start.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle a1119266c1 Revert to hand-rolled reduction; drop Lattice_reduction_gpu_cub.h
Remove the CUB/hipCUB direction entirely. Restore Lattice_reduction_gpu.h,
Lattice_reduction_sycl.h, and Lattice_reduction.h to the state before the
CUB rewrite (commit 969b0a39), recovering the original primary function names
(sumD_gpu_small, sumD_gpu_large, sumD_gpu, sum_gpu, sum_gpu_large) and the
hand-rolled shared-memory reduction kernel.

Delete Lattice_reduction_gpu_cub.h. Update Test_reduction to remove the
old/new comparison sections that depended on sum_gpu_old.

The lesson: CUB DeviceReduce is slower than the hand-rolled kernel for small
types, and the smem sizing problem for the extraction pass has no clean
solution within the accelerator_for abstraction. The right improvement is
a higher radix (12 then 4) in sumD_gpu_large, applied directly to the
existing hand-rolled kernel.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle a0f00c0eca sumD_gpu_direct: revert to per-lane write; CUB handles Nsimd*osites inputs
Benchmarking showed the shared-memory lane-summation approach (843d6497)
was slower than writing each SIMD lane individually and letting CUB reduce
the full nlanes = osites*Nsimd array. CUB's device reduce is more efficient
over the larger input than the smem overhead + serialised lane-0 summation.
The smem approach also required overriding acceleratorThreads() to avoid
the block-size sizing problem. Restore the simpler per-lane path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle d358954a84 sumD_gpu_direct: shared-memory lane reduction with acceleratorThreads(1)
Set acceleratorThreads to 1 before the extraction kernel so that
dim3(nsimd,1,1) blocks give exactly one site group per block and
__shared__ sobjD smem[nsimd] is correctly sized without depending on
the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane)
indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1]
and writes one sobjD per site. CUB then reduces osites elements instead
of osites*nsimd, reducing both store traffic and CUB work by Nsimd.
acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle aee00bdfb5 sumD_gpu_direct: one thread per SIMD lane using extractLane
Replaces one thread per outer site calling Reduce() (sequential Nsimd-wide
loop) with one thread per lane calling extractLane() — O(1) per thread.
CUB now reduces over osites*Nsimd elements. Avoids serial lane reduction
but leaves the per-lane sobjD store stride as a known remaining concern.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle cf324b0fa1 Lattice_reduction_gpu_cub: define GRID_REDUCTION_TIMING in header
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle b314dc224d Lattice_reduction_gpu_cub: add GRID_REDUCTION_TIMING instrumentation
Guards accelerator_for and CUB DeviceReduce calls in sumD_gpu_direct
and sumD_gpu_large with #ifdef GRID_REDUCTION_TIMING to isolate where
time is spent in each path. Large path accumulates across all groups
and prints totals with words/nfull/rem context.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 1bbd62498e Lattice_reduction_gpu_cub: replace WordBundle4 with iVector<iScalar<scalarD>,4>
WordBundle4 was redundant with Grid's existing tensor infrastructure.
iVector<iScalar<scalarD>,4> already provides accelerator_inline operator+,
zeroit(), and sycl::is_device_copyable — no new type needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle f3c3b1c04b Test_reduction: add timing benchmark for new vs old reduction paths
Reports us/call and GB/s for sum_gpu (CUB/sycl::reduction) and
sum_gpu_old (hand-rolled shared-memory) for each field type, with
5-call warmup and 100-call timed loop.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 069f98b253 skills: HPC battle-hardening skill files for GPU+MPI correctness
Six skill files encoding expertise for making codebases robust on
problematic HPC systems, covering: correctness verification
(double-run, fingerprinting, flight recorder), hang diagnosis,
GPU runtime correctness (premature barrier, infinite poll),
MPI correctness on heterogeneous systems (device buffer aliasing,
AARCH64 PLT corruption, deterministic reductions),
compiler validation, and communication/computation overlap pipeline
design.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle dfd0503eae Test_reduction: use separate float and double grids
Float fields require a grid constructed with vComplexF::Nsimd(); using
a double grid causes grid->_gsites to undercount the sites in float
vobjF, making the constant-field expected value wrong.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle c629b2e87e Rename scalarNorm2 to squaredSum in Test_reduction.cc
The function computes |sum|^2 — the squared magnitude of an aggregate sum —
not a norm. squaredSum makes clear that squaring is applied to the sum, not
to individual site values before summing, distinguishing it from sumOfSquares
(the squared L2 norm).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 7c8462abd1 Fix Zero() used on thrust::complex in WordBundle4 initialisation
Grid's Zero() sentinel is not assignable to thrust::complex<double>;
use scalarD(0) instead.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 95a6a0bde7 Reinstate large/small dispatch in CUB reduction path; radix-4 word-bundle for large types
rocPRIM's DeviceReduce requires warpSize(64) threads each holding one element in shared
memory, so sizeof(T)*64 must fit in sharedMemPerBlock.  LatticePropagator::scalar_objectD
is 2304 bytes (64*2304 = 147 KB), exceeding the budget and triggering a compile-time
static_assert in limit_block_size.

Introduce sumD_gpu_direct (the original direct-CUB path, safe for small types) and a new
sumD_gpu_large that groups the vobj's vector_type words in bundles of 4, reducing each
bundle as WordBundle4<scalarD> (64 bytes, 64*64 = 4 KB — always within budget).  If
words % 4 != 0, the final partial bundle is zero-padded.  sumD_gpu dispatches at compile
time via if constexpr on sizeof(sobjD) > 512.

For LatticePropagator (144 words) this gives 36 CUB launches instead of 144.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle bba328fac5 Add Test_reduction to tests/debug
Tests the new CUB/hipCUB/SYCL lattice reduction (sum_gpu) against the
preserved hand-rolled implementation (sum_gpu_old) for LatticeComplexF/D,
LatticeColourMatrixF/D and LatticePropagatorF/D.

Part a) gaussian random field: checks that old and new agree to within
float/double roundoff tolerance.
Part b) constant field (= 1.0, identity-matrix init): verifies
innerProduct(sum, sum) = Ncomp * V^2 where Ncomp counts the nonzero
diagonal scalar components per site (1 / Nc / Ns*Nc respectively).

Make.inc is auto-generated by scripts/filelist on bootstrap and is not
tracked; the new .cc file is all that is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 41362349f3 Rewrite lattice GPU reduction to use CUB, hipCUB, and SYCL reduction
Replace hand-rolled shared-memory reduction kernels (reduceBlock/reduceBlocks/
reduceKernel) and the global device variable retirementCount with a unified
CUB/hipCUB DeviceReduce::Reduce path for CUDA/HIP and sycl::reduction for SYCL.
No small/large split is needed: both CUB and sycl::reduction handle arbitrary
object sizes internally.

Old implementations preserved as sum_gpu_old / sumD_gpu_old etc. in the
original files for regression testing on GPU hardware.

Also add CLAUDE.md with build, test, and architecture guidance.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:34:30 -04:00
Peter Boyle 12e3499b6d Updated rocm 7 compile for ORNL 2026-05-21 12:28:42 -04:00
Peter Boyle 9576011011 Changed setup for ROCM 7, nasty LD_LIBRARY_PATH issues were committing
evils
2026-05-21 12:28:04 -04:00
Peter Boyle 155b34c1aa File list lost 2026-05-21 12:06:01 -04:00
Peter Boyle 982ffe9ebe Lattice_reduction_gpu: demote timing logs to Debug, disable by default
skills/mpi-heterogeneous: add Bug Class 4 for Frontier GTL/libamdhip64 ABI mismatch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:05:36 -04:00
Peter Boyle 0251ecaeab Test_planned_fft: fix PlannedFFT template parameter to use ::vector_object
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 18:13:38 -04:00
Peter Boyle 372a27d645 tests: add Test_planned_fft exercising PlannedFFT<vobj>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 17:59:24 -04:00
Peter Boyle 72b4a061f3 tests/fft: remove PlanDestroy calls (FFT handles plans per-call)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 17:54:41 -04:00
Peter Boyle 29198efabe FFT: add FFTbase, PlannedFFT; factor FFT_dim_execute free function
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 17:53:17 -04:00
Peter Boyle 50aa51f93a debug: add Test_hipfft_repro — reproducer for hipFFT PARSE_ERROR on ROCm 7
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:27:27 -04:00
Peter Boyle 79ccc81a86 tests/debug: add G=4 to hipfft fail reproducer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:21:52 -04:00
Peter Boyle 3f0fdbb597 tests/debug: test hipMemset variant before cache is populated
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:10:16 -04:00
Peter Boyle ea57bd8f03 tests/debug: extend hipfft fail reproducer with hipMemset and sync variants
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:02:02 -04:00
Peter Boyle bdba5b8403 FFT: use host stack buffer in PlanCreate, not deviceVector
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 21:49:06 -04:00
Peter Boyle 58cc6ca9c0 tests/debug: add minimal hipfft ordering bug fail/pass pair
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 21:48:23 -04:00
Peter Boyle e5996b440d tests/debug: test plan-before-malloc vs malloc-before-plan ordering
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 21:40:17 -04:00
Peter Boyle ad9d03fd85 tests/debug: extend hipfft reproducer with Grid-realistic howmany and exec tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 19:19:59 -04:00
Peter Boyle 4de160ce20 tests/debug: add minimal hipfft plan-creation reproducer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 17:52:59 -04:00
Peter Boyle fc8c8ce6e7 FFT HIP: use hipfftCreate+hipfftMakePlanMany instead of hipfftPlanMany 2026-05-19 17:29:28 -04:00
Peter Boyle ddbb7f07c8 FFT: pass nullptr for inembed/onembed in hipfftPlanMany to avoid HIPFFT_PARSE_ERROR 2026-05-19 17:15:21 -04:00
Peter Boyle a5a04929fb Merge pull request #492 from giltirn/develop
Fixes to support CUDA > 13
2026-05-19 15:26:58 -04:00
Peter Boyle 1e29c59bcc FFT: cache plans per vobj type across calls
Plans are created lazily on the first FFT_dim call and reused for all
subsequent calls on the same FFT object.  PlanCreate<vobj>() can be
called explicitly to pre-warm the cache.  PlanDestroy() must be called
before switching to a different vobj type; the destructor cleans up any
live plans automatically.

Update Test_fft.cc and Test_fftf.cc to call PlanDestroy() between the
LatticeComplex and LatticeSpinMatrix sections that reuse the same FFT object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 15:12:10 -04:00
Peter Boyle b6abdc3845 Accelerator: lower default accelerator_threads from 16 to 8
Benchmark_dwf_fp32 on MI250X GCD: 1.7 TF/s at nt=8, ~300 GF/s at nt=16.
With Nsimd=8 (fp32, GEN_SIMD_WIDTH=64B), nt=8 gives exactly 64 threads =
one full AMD wavefront. Higher values double register demand per block and
hit a register-pressure cliff for stencil kernels.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 13:41:03 -04:00
Christopher Kelly 77b8657fcc Fixes to support CUDA > 13. Specifically, the CUDA header is no longer accidentally included within Grid's namespace, and the breaking change to cub::Sum() -> ::cuda::std::plus<>{} in CUDA-13 has been worked around 2026-05-19 12:22:14 -04:00
Peter Boyle 2fadd8bb62 Accelerator: raise default accelerator_threads from 2 to 16 2026-05-19 10:15:53 -04:00
Peter Boyle 60df2dd5d0 skills: add gpu-memory-performance.md
Documents the acceleratorThreads() default=2 trap, LambdaApply thread
mapping, coalescedRead/Write idiom, when to use __global__ vs
accelerator_for, and fused vs staged HBM access patterns.

Includes observed MI250X numbers from LatticePropagatorD reduction
(50 → 297 → 546 GB/s progression).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 10:03:32 -04:00
Peter Boyle 66b529b345 sumD_gpu_reduce_words: fuse pack+reduce into single packReduceKernel
Replace the two-kernel pack+reduce sequence with a single fused kernel
packReduceKernel<R> that reads R words of each vobj at offset 'base'
and accumulates directly into iVector<iScalar<scalarD>,R>, eliminating
the intermediate bundle buffer entirely.

HBM access per word-group drops from 3x (pack-read + pack-write +
reduce-read) to 1x.  Thread count comes from getNumBlocksAndThreads
(warpSize..256) rather than acceleratorThreads(), so occupancy is
correct regardless of the --accelerator-threads setting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 09:46:43 -04:00
Peter Boyle 1304172a93 Modified repack 2026-05-19 08:53:13 -04:00
Peter Boyle 1315d4604d Enable GRID_REDUCTION_TIMING unconditionally
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 22:14:00 -04:00
Peter Boyle a31af31328 Lattice_reduction_gpu: add GRID_REDUCTION_TIMING instrumentation
Uncomment #define GRID_REDUCTION_TIMING to enable per-phase timing output:

  sumD_gpu_reduce_words: pack time (accelerator_for) per R and base
  sumD_gpu_small:        reduceKernel+barrier time and D2H time separately
  sumD_gpu_large:        total wall time across all word groups

This lets us identify whether the large-type bottleneck is in the pack
kernel, the shared-memory reduction kernel, the barrier, or the D2H.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 22:13:30 -04:00
Peter Boyle 26c3c7d8f9 sumD_gpu_large: radix-12 word-bundle reduction replacing radix-1
Replace the word-by-word loop (one kernel launch per scalar word) with
sumD_gpu_reduce_words<R> which packs R consecutive vector_type words per
site into iVector<iScalar<vector>,R>, then calls the existing sumD_gpu_small
shared-memory kernel once for the whole bundle.

Dispatch: radix-12 first, radix-4 for the remainder < 12, radix-1 for
any final < 4 words.  For LatticePropagator (144 words = 12x12), this
reduces the kernel-launch count from 144 to 12 -- a 12x reduction.

Bundle::Nsimd() inherits from vector_type so sumD_gpu_small handles SIMD
lane extraction and double-precision promotion identically to the scalar
word case.  sizeof(Bundle::scalar_objectD) = R*16 <= 192 B; well within
sharedMemPerBlock on all supported devices.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:56:45 -04:00
Peter Boyle 0650d7c7eb Lattice_reduction_sycl: fix double-precision accumulation in sumD_gpu_tensor
Accumulate in sobjD throughout rather than accumulating in sobj and
converting the final sum. For float fields this matters: summing N floats
then casting loses O(N*eps_float) relative precision vs accumulating in
double from the start.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:53:40 -04:00
Peter Boyle 068f95ad2d Revert to hand-rolled reduction; drop Lattice_reduction_gpu_cub.h
Remove the CUB/hipCUB direction entirely. Restore Lattice_reduction_gpu.h,
Lattice_reduction_sycl.h, and Lattice_reduction.h to the state before the
CUB rewrite (commit 969b0a39), recovering the original primary function names
(sumD_gpu_small, sumD_gpu_large, sumD_gpu, sum_gpu, sum_gpu_large) and the
hand-rolled shared-memory reduction kernel.

Delete Lattice_reduction_gpu_cub.h. Update Test_reduction to remove the
old/new comparison sections that depended on sum_gpu_old.

The lesson: CUB DeviceReduce is slower than the hand-rolled kernel for small
types, and the smem sizing problem for the extraction pass has no clean
solution within the accelerator_for abstraction. The right improvement is
a higher radix (12 then 4) in sumD_gpu_large, applied directly to the
existing hand-rolled kernel.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:52:18 -04:00
Peter Boyle f4fbf7c9ca sumD_gpu_direct: revert to per-lane write; CUB handles Nsimd*osites inputs
Benchmarking showed the shared-memory lane-summation approach (843d6497)
was slower than writing each SIMD lane individually and letting CUB reduce
the full nlanes = osites*Nsimd array. CUB's device reduce is more efficient
over the larger input than the smem overhead + serialised lane-0 summation.
The smem approach also required overriding acceleratorThreads() to avoid
the block-size sizing problem. Restore the simpler per-lane path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:23:15 -04:00
Peter Boyle 843d6497b2 sumD_gpu_direct: shared-memory lane reduction with acceleratorThreads(1)
Set acceleratorThreads to 1 before the extraction kernel so that
dim3(nsimd,1,1) blocks give exactly one site group per block and
__shared__ sobjD smem[nsimd] is correctly sized without depending on
the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane)
indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1]
and writes one sobjD per site. CUB then reduces osites elements instead
of osites*nsimd, reducing both store traffic and CUB work by Nsimd.
acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:08:10 -04:00
Peter Boyle 747c167658 sumD_gpu_direct: one thread per SIMD lane using extractLane
Replaces one thread per outer site calling Reduce() (sequential Nsimd-wide
loop) with one thread per lane calling extractLane() — O(1) per thread.
CUB now reduces over osites*Nsimd elements. Avoids serial lane reduction
but leaves the per-lane sobjD store stride as a known remaining concern.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 16:21:50 -04:00
Peter Boyle fca2c5dba0 Lattice_reduction_gpu_cub: define GRID_REDUCTION_TIMING in header
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 14:54:08 -04:00
Peter Boyle e12bc7f07c Lattice_reduction_gpu_cub: add GRID_REDUCTION_TIMING instrumentation
Guards accelerator_for and CUB DeviceReduce calls in sumD_gpu_direct
and sumD_gpu_large with #ifdef GRID_REDUCTION_TIMING to isolate where
time is spent in each path. Large path accumulates across all groups
and prints totals with words/nfull/rem context.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 14:23:44 -04:00
Peter Boyle dc6ae51cab Lattice_reduction_gpu_cub: replace WordBundle4 with iVector<iScalar<scalarD>,4>
WordBundle4 was redundant with Grid's existing tensor infrastructure.
iVector<iScalar<scalarD>,4> already provides accelerator_inline operator+,
zeroit(), and sycl::is_device_copyable — no new type needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 13:55:28 -04:00
Peter Boyle baa70d8ec9 Test_reduction: add timing benchmark for new vs old reduction paths
Reports us/call and GB/s for sum_gpu (CUB/sycl::reduction) and
sum_gpu_old (hand-rolled shared-memory) for each field type, with
5-call warmup and 100-call timed loop.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 12:31:13 -04:00
Peter Boyle c93b338bdd skills: HPC battle-hardening skill files for GPU+MPI correctness
Six skill files encoding expertise for making codebases robust on
problematic HPC systems, covering: correctness verification
(double-run, fingerprinting, flight recorder), hang diagnosis,
GPU runtime correctness (premature barrier, infinite poll),
MPI correctness on heterogeneous systems (device buffer aliasing,
AARCH64 PLT corruption, deterministic reductions),
compiler validation, and communication/computation overlap pipeline
design.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 12:10:44 -04:00
Peter Boyle c0472aa0ec Test_reduction: use separate float and double grids
Float fields require a grid constructed with vComplexF::Nsimd(); using
a double grid causes grid->_gsites to undercount the sites in float
vobjF, making the constant-field expected value wrong.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 12:09:35 -04:00
Peter Boyle 09552cfd73 Rename scalarNorm2 to squaredSum in Test_reduction.cc
The function computes |sum|^2 — the squared magnitude of an aggregate sum —
not a norm. squaredSum makes clear that squaring is applied to the sum, not
to individual site values before summing, distinguishing it from sumOfSquares
(the squared L2 norm).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 23:15:11 -04:00
Peter Boyle 003fec509c Fix Zero() used on thrust::complex in WordBundle4 initialisation
Grid's Zero() sentinel is not assignable to thrust::complex<double>;
use scalarD(0) instead.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 18:10:17 -04:00
Peter Boyle 773a82d87f Reinstate large/small dispatch in CUB reduction path; radix-4 word-bundle for large types
rocPRIM's DeviceReduce requires warpSize(64) threads each holding one element in shared
memory, so sizeof(T)*64 must fit in sharedMemPerBlock.  LatticePropagator::scalar_objectD
is 2304 bytes (64*2304 = 147 KB), exceeding the budget and triggering a compile-time
static_assert in limit_block_size.

Introduce sumD_gpu_direct (the original direct-CUB path, safe for small types) and a new
sumD_gpu_large that groups the vobj's vector_type words in bundles of 4, reducing each
bundle as WordBundle4<scalarD> (64 bytes, 64*64 = 4 KB — always within budget).  If
words % 4 != 0, the final partial bundle is zero-padded.  sumD_gpu dispatches at compile
time via if constexpr on sizeof(sobjD) > 512.

For LatticePropagator (144 words) this gives 36 CUB launches instead of 144.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 16:55:58 -04:00
Peter Boyle 286c29d6fb Add Test_reduction to tests/debug
Tests the new CUB/hipCUB/SYCL lattice reduction (sum_gpu) against the
preserved hand-rolled implementation (sum_gpu_old) for LatticeComplexF/D,
LatticeColourMatrixF/D and LatticePropagatorF/D.

Part a) gaussian random field: checks that old and new agree to within
float/double roundoff tolerance.
Part b) constant field (= 1.0, identity-matrix init): verifies
innerProduct(sum, sum) = Ncomp * V^2 where Ncomp counts the nonzero
diagonal scalar components per site (1 / Nc / Ns*Nc respectively).

Make.inc is auto-generated by scripts/filelist on bootstrap and is not
tracked; the new .cc file is all that is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 14:31:33 -04:00
Peter Boyle 969b0a3922 Rewrite lattice GPU reduction to use CUB, hipCUB, and SYCL reduction
Replace hand-rolled shared-memory reduction kernels (reduceBlock/reduceBlocks/
reduceKernel) and the global device variable retirementCount with a unified
CUB/hipCUB DeviceReduce::Reduce path for CUDA/HIP and sycl::reduction for SYCL.
No small/large split is needed: both CUB and sycl::reduction handle arbitrary
object sizes internally.

Old implementations preserved as sum_gpu_old / sumD_gpu_old etc. in the
original files for regression testing on GPU hardware.

Also add CLAUDE.md with build, test, and architecture guidance.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 13:41:56 -04:00
Peter Boyle f8b2eacf99 File list issue (Ed Bennets pull request?) 2026-05-15 12:57:42 -04:00
Peter Boyle 6140ac6864 Hip Happy 2026-05-15 12:13:01 -04:00
Peter Boyle c6c2834e03 Hip Happy 2026-05-15 11:30:29 -04:00
Peter Boyle 856545a1db Support ROCM 7.0.2 2026-05-15 11:30:29 -04:00
Peter Boyle e2d607f6c7 Merge pull request #490 from jdmaia/hip-guard-acceleratorfor2dNB
[HIP] Including kernel launch parameter guard on accelerator_for2dNB
2026-05-06 14:51:30 -04:00
Julio Maia 66da4e0657 Including guard on accelerator_for2dNB against invalid kernel configurations if GRID_HIP 2026-05-06 13:26:33 -05:00
Peter Boyle b37390bb5a 4 node usqcd run 2026-04-27 14:40:11 -07:00
Peter Boyle 829dc8cceb 32 node 2026-04-27 14:38:02 -07:00
Peter Boyle 13cc2c39f5 FOM run 2026-04-27 14:20:49 -07:00
Peter Boyle 66ea3b271c Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2026-04-27 13:55:52 -07:00
Peter Boyle d293b58a20 384 node baseline run 2026-04-27 13:54:40 -07:00
Peter Boyle ce093b2bf3 rdtsc 2026-04-27 13:54:06 -07:00
Peter Boyle e4404efe5a Perlmutter compile update 2026-04-27 13:53:28 -07:00
Peter Boyle 5ce270f1de Adding Claude related files 2026-04-21 10:41:18 -04:00
Peter Boyle af43b067a0 New CLAUDE controllable visualiser 2026-04-10 11:23:25 -04:00
Quadro 34b44d1fee New file for animation in MD time direction 2026-04-02 13:55:38 -04:00
Peter Boyle 595ceaac37 Include grid header and make the ENABLE correct 2026-03-11 17:24:44 -04:00
Peter Boyle daf5834e8e Fixing incorrect PR about disable fermion instantiations 2026-03-11 17:05:46 -04:00
Peter Boyle 0d8658a039 Optimised 2026-03-05 06:06:32 -05:00
Peter Boyle 095e004d01 Setup change GCR 2026-03-05 06:06:32 -05:00
Peter Boyle 0acabee7f6 Modest change 2026-03-05 06:06:32 -05:00
Peter Boyle 76fbcffb60 Improvement to 16^3 hdcg 2026-03-05 06:06:32 -05:00
Peter Boyle a0a62d7ead Merge pull request #478 from vataspro/PolyakovUpstream
Spatial Polyakov Loop implementation
2026-02-24 20:45:42 -05:00
Peter Boyle c5038ea6a5 Merge pull request #483 from cmcknigh/bugfix/rocm7-rocblas-type-refactor
Adding a version check to handle rocBlas type refactor
2026-02-24 20:45:03 -05:00
Peter Boyle a5120903eb Merge pull request #486 from RChrHill/fix/sp4-fp32
Define Sp4 ProjectOnGeneralGroup for generic vtype
2026-02-24 20:44:08 -05:00
Peter Boyle 00b286a08a Merge pull request #488 from RChrHill/feature/additional-ET-traces
Add ET support for Lattice spin- and colour-traces
2026-02-24 20:43:45 -05:00
Peter Boyle 24a9759353 Merge pull request #485 from edbennett/skip-fermion-instantiations
Be able to skip compiling fermion instantiations altogether
2026-02-24 20:43:20 -05:00
edbennett 1b56f6f46d be able to skip compiling fermion instantiations altogether 2026-02-24 23:52:18 +00:00
Peter Boyle 2a8084d569 Subspace setup 2026-02-13 17:26:11 -05:00
Peter Boyle 6ff29f9d4f Alternate multigrids 2026-02-13 17:25:45 -05:00
RChHill c4d3e79193 Add ET support for Lattice spin- and colour-traces 2026-01-29 14:46:52 +00:00
Peter Boyle 7cd3f21e6b preserving a bunch of experiments on setup and g5 subspace doubling 2026-01-06 05:57:39 -05:00
paboyle 4a0aaf0786 Fix issue with Aurora compilers 2025-11-21 21:41:13 +00:00
paboyle 9c3835524c Fix compile warn 2025-11-21 21:41:12 +00:00
paboyle 549351bb8a Stag verbose clean up 2025-11-20 18:22:57 +00:00
RChHill b650b89682 Define Sp4 ProjectOnGeneralGroup for generic vtype 2025-11-19 13:26:52 +00:00
Peter Boyle 74e6b19f83 Looks like the reuse of xfers in staggered has bugs or corner cases depending on volume 2025-11-17 22:29:06 -05:00
Peter Boyle 2e684028de Improvements 2025-11-14 18:12:27 -05:00
paboyle c54d87a472 Aurora compile fix for new compiler 2025-11-06 18:17:33 +00:00
Allen McKnight 4304245c1b Merge branch 'develop' into bugfix/rocm7-rocblas-type-refactor 2025-11-04 08:50:11 -06:00
Peter Boyle 6165931afa Update GridStd.h 2025-10-03 14:35:37 -04:00
Your Name 1d1fd3bcaf adding a version check to handle rocblas type change 2025-10-02 15:24:24 -05:00
paboyle 23581333e6 link cufft 2025-08-21 22:25:55 +01:00
paboyle e5fa3d887f Compile on CUDA 2025-08-21 22:10:27 +01:00
paboyle 583fa7bb0a FFTW guarded after CUDA adn HIP 2025-08-21 22:00:12 +01:00
Peter Boyle fe0db53842 FFT offload to GPU and MUCH faster comms.
40x speed up on Frontier
2025-08-21 16:45:38 -04:00
Peter Boyle 76c0ada1e1 Benchmark for En Hung 2025-08-21 16:45:38 -04:00
Peter Boyle 92f49e9194 Merge pull request #482 from g-simonetti/wflow_sp2n_paboyle
Fixed Wilson flow for Nc not equal to 3
2025-08-21 09:10:25 -04:00
Peter Boyle 44c8057b5f Merge pull request #481 from vataspro/sp-reps-fix
Only compile higher fermion representations for symplectic gauge group when requested via configure flag
2025-08-20 12:57:28 -04:00
Alexis Provatas 0ad837f595 Fix Sp representations compilation 2025-08-20 17:48:39 +01:00
Peter Boyle bd2103c746 Merge pull request #480 from vataspro/fix-no-comms
Fix enable-comms=none
2025-08-20 12:26:47 -04:00
Alexis Provatas 9c18d2ddb0 Fix StencilSendToRecvFromBegin to agree with base 2025-08-20 17:17:06 +01:00
g-simonetti 1245a8c151 num_colours added to class S 2025-08-20 16:27:34 +01:00
g-simonetti 07113dc8ba Changed beta=3 to beta=Nc with comments 2025-08-20 16:18:34 +01:00
paboyle a3420e6fa9 Update for grid view logging 2025-08-14 21:29:20 +00:00
paboyle 732836d9f8 Missed one 2025-08-14 20:25:54 +00:00
paboyle 87658f7b53 ASSERT tripped in Shuhei's branch 2025-08-14 20:08:54 +00:00
paboyle e7f51e5fb1 Timer pointers for hadrons compat.
Reluctantly, this interface is silly to pass timers around.
2025-08-11 21:11:36 +01:00
Peter Boyle 1ce5f70dd1 Update GridStd.h 2025-08-11 12:20:54 -04:00
Peter Boyle 473635f401 Update BinaryIO.h 2025-08-11 11:06:06 -04:00
paboyle 5adf2657dd Updated to compile and run fast on CUDA 2025-08-10 00:00:13 +01:00
paboyle 82cfff2990 A2A meson field BLAS based momentum project 2025-08-07 15:51:15 +00:00
paboyle 4397b1c442 Debugged momentum projection for A2A Meson Field 2025-08-07 15:51:01 +00:00
paboyle 9e6a4a4737 Assertion updates to macros (mostly) with backtrace.
WIlson flow to include options for DBW2, Iwasaki, Symanzik.
View logging for data assurance
2025-08-07 15:48:38 +00:00
paboyle 41f344bbd3 Merge with Christoph GPT checksum debug 2025-07-15 03:06:09 +00:00
paboyle a77cd50b2f Update comms logging in Cshift 2025-07-11 14:36:10 +00:00
Alexis Provatas c646d91527 Fix names, protect against bad index values, clean docstrings 2025-05-01 10:52:00 +01:00
Alexis Provatas a2b98d82e1 remove obsolete spatial polyakov observable file 2025-05-01 10:52:00 +01:00
Alexis Provatas 7b9415c088 Move observable logger to Polyakov Loop file and fix docstring 2025-05-01 10:52:00 +01:00
Alexis Provatas cb7110f492 Add Spatial Polyakov Loop observable 2025-05-01 10:52:00 +01:00
Alexis Provatas 0c7af66490 Create Spatial Polyakov Observable Module 2025-05-01 10:52:00 +01:00
Alexis Provatas 496d1b914a Generalise Polyakov loop and overload for temporal direction 2025-05-01 10:52:00 +01:00
387 changed files with 12789 additions and 2853 deletions
+24 -24
View File
@@ -120,7 +120,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(ba
cudaGetErrorString( err )); \
printf("File %s Line %d\n",__FILE__,__LINE__); \
fflush(stdout); \
if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
if (acceleratorAbortOnGpuError) GRID_ASSERT(err==cudaSuccess); \
} \
}
@@ -168,7 +168,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -276,11 +276,11 @@ public:
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
assert(err==hipSuccess);
GRID_ASSERT(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess);
GRID_ASSERT(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
@@ -305,8 +305,8 @@ public:
{
RealD t2=usecond();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -341,7 +341,7 @@ public:
(hipblasDoubleComplex *) Bkn, ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc);
assert(err==HIPBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -361,7 +361,7 @@ public:
(cuDoubleComplex *) Bkn, ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -433,8 +433,8 @@ public:
{
RealD t2=usecond();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -469,7 +469,7 @@ public:
(hipblasComplex *) Bkn, ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex *) Cmn, ldc);
assert(err==HIPBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -489,7 +489,7 @@ public:
(cuComplex *) Bkn, ldb,
(cuComplex *) &beta_p[0],
(cuComplex *) Cmn, ldc);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -595,11 +595,11 @@ public:
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -636,7 +636,7 @@ public:
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -657,7 +657,7 @@ public:
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -804,8 +804,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -821,8 +821,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
@@ -843,7 +843,7 @@ public:
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -864,7 +864,7 @@ public:
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
+98
View File
@@ -0,0 +1,98 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## What This Is
Grid is a data-parallel C++ library for lattice QCD. It provides SIMD-vectorised lattice containers, MPI-based domain decomposition, GPU acceleration (CUDA/HIP/SYCL), and a full suite of QCD algorithms including HMC.
## Build
Uses GNU Autotools. The bootstrap step only needs to run once (or after `configure.ac` changes).
```bash
./bootstrap.sh # downloads Eigen 3.4.0, generates configure
mkdir build && cd build
../configure [options]
make -j$(nproc)
make check # run root-level tests
make install
```
Key configure options:
| Option | Common values |
|--------|---------------|
| `--enable-simd=` | `AVX2`, `AVX512`, `KNL`, `A64FX`, `NEONv8`, `GPU` |
| `--enable-comms=` | `mpi-auto`, `mpi3-auto`, `none` |
| `--enable-accelerator=` | `cuda`, `hip`, `sycl` |
| `--enable-shm=` | `shmopen`, `hugetlbfs`, `nvlink` |
| `--enable-Nc=` | `3` (default), `2`, `4`, `5` |
| `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs |
| `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features |
Platform recipes from `README.md`:
- **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl`
- **Skylake/Haswell**: `--enable-simd=AVX512` or `AVX2` + `--enable-comms=mpi3-auto`
- **AMD EPYC**: `--enable-simd=AVX2 --enable-comms=mpi3`
- **A64FX (Fugaku)**: `--enable-simd=A64FX --enable-comms=mpi3 --enable-shm=shmget` (see `SVE_README.txt`)
Required external libs: GMP, MPFR, OpenSSL, zlib.
## Running Tests
```bash
# From build directory
make check # root-level tests (Test_simd, Test_cshift, etc.)
make -C tests/<subdir> tests # build tests in a subdirectory
./tests/core/Test_simd # run a single test binary directly
```
Test subdirectories and their focus: `core` (SIMD, stencil, comms), `solver` (CG, GMRES, eigensolvers), `hmc` (MD integrators), `forces` (fermion forces), `lanczos`, `IO`, `smearing`, `sp2n`, `debug`.
## Architecture
### Layer stack (bottom to top)
1. **SIMD layer** (`Grid/simd/`) — platform-specific intrinsics wrapped into `vRealF`, `vComplexD`, etc. The SIMD width and layout are compile-time constants controlled by `--enable-simd`.
2. **Tensor layer** (`Grid/tensors/`) — Lorentz/colour/spin tensor algebra built on top of SIMD types. `iMatrix`, `iVector`, `iScalar` templates compose into QCD types like `ColourMatrix`, `SpinColourVector`.
3. **Lattice layer** (`Grid/lattice/`) — `Lattice<T>` container: a site-local tensor replicated across a distributed Cartesian grid. All arithmetic is site-parallel and expression-template-fused.
4. **Cartesian/comms layer** (`Grid/cartesian/`, `Grid/communicator/`) — `GridCartesian` holds the MPI topology and local/global geometry. `Grid/cshift/` implements nearest-neighbour halo exchange; `Grid/stencil/` is the optimised multi-hop stencil used by Dirac operators.
5. **Algorithm layer** (`Grid/algorithms/`) — iterative solvers (CG, GMRES, BiCGSTAB, mixed-precision), eigensolvers (Lanczos, LAPACK), FFT, smearing.
6. **QCD layer** (`Grid/qcd/`) — gauge and fermion actions, HMC integrators, observables.
### QCD subsystem (`Grid/qcd/`)
- `action/fermion/` — Wilson, Clover, DWF (Mobius), Staggered, twisted-mass, G-parity variants
- `action/gauge/` — Wilson gauge, Symanzik, Iwasaki, DBW2, plaquette+rect
- `representations/` — Fundamental, Adjoint, Two-index, Sp(2n)
- `hmc/` — Leapfrog, OMF2/OMF4 integrators; pseudofermion refreshment; Metropolis accept/reject
- `smearing/` — APE, Stout, HEX, gradient flow
- `observables/` — Polyakov loop, plaquette, topological charge
### GPU acceleration
GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). The `Grid/simd/` SIMD types map to scalar on GPU device code; host code paths remain vectorised. Unified virtual memory is on by default (`--enable-unified=yes`); device-aware MPI (`--enable-accelerator-aware-mpi`) avoids device→host copies on transfers.
### Memory and I/O
- `Grid/allocator/` — aligned/NUMA-aware allocators; caching allocator via `--enable-alloc-cache`
- `Grid/parallelIO/` — distributed parallel reader/writer for ILDG (via LIME), SciDAC, and native binary formats
- `Grid/serialisation/` — text, binary, HDF5, XML/JSON serialisation of arbitrary Grid objects
### HMC applications
`HMC/` contains production-ready HMC driver programmes (e.g. `Mobius2p1f.cc`, `DWF_plus_DSDR_nf2plus1_Shamir_Gparity.cc`). These are built separately from the library tests.
## Key Conventions
- **C++17** is required throughout.
- Template structure: most classes are templated on `<_FImpl>` (fermion impl) or `<Gimpl>` (gauge impl), which encode the representation and precision. Instantiation is controlled by `--enable-fermion-instantiations`.
- The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`.
- Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`.
- Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code.
+39 -4
View File
@@ -1,9 +1,17 @@
#ifndef GRID_STD_H
#define GRID_STD_H
///////////////////
// Grid config
///////////////////
#include "Config.h"
///////////////////
// Std C++ dependencies
///////////////////
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <cassert>
#include <complex>
#include <memory>
@@ -15,7 +23,9 @@
#include <random>
#include <functional>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
@@ -23,11 +33,36 @@
#include <sys/time.h>
#include <chrono>
#include <zlib.h>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
void GridAbort(void);
#define ASSLOG(A) ::write(STDERR_FILENO,A,::strlen(A));
#ifdef HAVE_EXECINFO_H
#define GRID_ASSERT(b) if(!(b)) { \
fflush(stdout); \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE); \
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,STDERR_FILENO); \
GridAbort(); \
};
#else
#define GRID_ASSERT(b) if(!(b)) { \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
GridAbort(); \
};
#endif
///////////////////
// Grid config
///////////////////
#include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
+11 -7
View File
@@ -54,21 +54,25 @@ Version.h: version-cache
include Make.inc
include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_FERMION_INSTANTIATIONS
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
if BUILD_FERMION_REPS
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
endif
endif
lib_LIBRARIES = libGrid.a
+1 -1
View File
@@ -29,8 +29,8 @@ directory
#pragma once
#include <type_traits>
#include <cassert>
#include <exception>
#include <cassert>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
+2
View File
@@ -51,6 +51,8 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
// Not really deflation, but useful
#include <Grid/algorithms/blas/MomentumProject.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
+427 -235
View File
@@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Cshift.h
@@ -28,6 +28,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef GRID_CUDA
#include <cufft.h>
#endif
#ifdef GRID_HIP
#include <hipfft/hipfft.h>
#endif
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
@@ -35,266 +44,449 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <fftw3.h>
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
#endif
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#define FFTW_ESTIMATE (0)
#endif
class FFT {
private:
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops;
double flops_call;
uint64_t usec;
Coordinate dimensions;
Coordinate processors;
Coordinate processor_coor;
template<class scalar> struct FFTW {
};
#ifdef GRID_HIP
template<> struct FFTW<ComplexD> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{
flops=0;
usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors,*grid);
};
~FFT ( void) {
delete sgrid;
typedef hipfftDoubleComplex FFTW_scalar;
typedef hipfftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
GRID_ASSERT(rv==HIPFFT_SUCCESS);
return p;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
hipfftResult rv;
if ( sign == forward ) rv =hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
else rv =hipfftExecZ2Z(p,in,out,HIPFFT_BACKWARD);
accelerator_barrier();
GRID_ASSERT(rv==HIPFFT_SUCCESS);
}
inline static void fftw_destroy_plan(const FFTW_plan p) { hipfftDestroy(p); }
};
template<> struct FFTW<ComplexF> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef hipfftComplex FFTW_scalar;
typedef hipfftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
GRID_ASSERT(rv==HIPFFT_SUCCESS);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
hipfftResult rv;
if ( sign == forward ) rv =hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
else rv =hipfftExecC2C(p,in,out,HIPFFT_BACKWARD);
accelerator_barrier();
GRID_ASSERT(rv==HIPFFT_SUCCESS);
}
inline static void fftw_destroy_plan(const FFTW_plan p) { hipfftDestroy(p); }
};
#endif
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
#ifdef GRID_CUDA
template<> struct FFTW<ComplexD> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftDoubleComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
else cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) { cufftDestroy(p); }
};
template<> struct FFTW<ComplexF> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
else cufftExecC2C(p,in,out,CUFFT_INVERSE);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) { cufftDestroy(p); }
};
#endif
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftw_destroy_plan(p); }
};
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) { ::fftwf_destroy_plan(p); }
};
#endif
#endif
struct FFTbase {
double flops;
double flops_call;
uint64_t usec;
GridCartesian *_grid;
static const int forward = FFTW_FORWARD;
static const int backward = FFTW_BACKWARD;
double Flops(void) { return flops; }
double MFlops(void) { return flops / usec; }
double USec(void) { return (double)usec; }
FFTbase(GridCartesian *grid) : _grid(grid), flops(0), flops_call(0), usec(0) {}
};
// Barrel-shift gather, FFT execute, and insert. Called by both FFT and PlannedFFT.
// The caller is responsible for plan acquisition and destruction.
template<class vobj>
static void FFT_dim_execute(
Lattice<vobj> &result,
const Lattice<vobj> &source,
int dim, int sign,
typename FFTW<typename vobj::scalar_type>::FFTW_plan p,
GridCartesian *grid,
double &flops, double &flops_call, uint64_t &usec)
{
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
const int Ndim = grid->Nd();
int L = grid->_ldimensions[dim];
int G = grid->_fdimensions[dim];
int Ncomp = sizeof(sobj) / sizeof(scalar);
int64_t Nlow = 1, Nhigh = 1;
for (int d = 0; d < dim; d++) Nlow *= grid->_ldimensions[d];
for (int d = dim+1; d < Ndim; d++) Nhigh *= grid->_ldimensions[d];
int64_t Nperp = Nlow * Nhigh;
deviceVector<scalar> pgbuf(Nperp * Ncomp * G);
scalar *pgbuf_v = &pgbuf[0];
int howmany = Ncomp * Nperp;
scalar div;
if (sign == FFTW_BACKWARD) div = 1.0 / G;
else if (sign == FFTW_FORWARD) div = 1.0;
else GRID_ASSERT(0);
double t_pencil = 0, t_fft = 0, t_copy = 0, t_shift = 0;
double t_total = -usecond();
result = source;
int pc = grid->_processor_coor[dim];
const Coordinate ldims = grid->_ldimensions;
const Coordinate rdims = grid->_rdimensions;
const Coordinate sdims = grid->_simd_layout;
const Coordinate processors = grid->_processors;
Coordinate pgdims(Ndim);
pgdims[0] = G;
for (int d = 0, dd = 1; d < Ndim; d++)
if (d != dim) pgdims[dd++] = ldims[d];
int64_t pgvol = 1;
for (int d = 0; d < Ndim; d++) pgvol *= pgdims[d];
const int Nsimd = vobj::Nsimd();
t_pencil = -usecond();
for (int p_idx = 0; p_idx < processors[dim]; p_idx++) {
t_copy -= usecond();
autoView(r_v, result, AcceleratorRead);
accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
#ifdef GRID_SIMT
{
int lane = acceleratorSIMTlane(Nsimd);
#else
for (int lane = 0; lane < Nsimd; lane++) {
#endif
Coordinate icoor, ocoor, pgcoor;
Lexicographic::CoorFromIndex(icoor, lane, sdims);
Lexicographic::CoorFromIndex(ocoor, idx, rdims);
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p_idx)%processors[dim])*L;
for (int d = 0, dd = 1; d < Ndim; d++)
if (d != dim) { pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d]; dd++; }
int64_t pgidx;
Lexicographic::IndexFromCoor(pgcoor, pgidx, pgdims);
vector_type *from = (vector_type *)&r_v[idx];
scalar_type stmp;
for (int w = 0; w < Ncomp; w++) {
stmp = getlane(from[w], lane);
pgbuf_v[pgidx + w*pgvol] = stmp;
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
t_copy += usecond();
if (p_idx != processors[dim] - 1) {
Lattice<vobj> temp(grid);
t_shift -= usecond();
temp = Cshift(result, dim, L); result = temp;
t_shift += usecond();
}
}
t_pencil += usecond();
FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
FFTW_scalar *out = (FFTW_scalar *)pgbuf_v;
t_fft = -usecond();
FFTW<scalar>::fftw_execute_dft(p, in, out, sign);
t_fft += usecond();
flops_call = 5.0 * howmany * G * log2(G);
usec = t_fft;
flops = flops_call;
result = Zero();
double t_insert = -usecond();
{
autoView(r_v, result, AcceleratorWrite);
accelerator_for(idx, grid->oSites(), Nsimd, {
#ifdef GRID_SIMT
{
int lane = acceleratorSIMTlane(Nsimd);
#else
for (int lane = 0; lane < Nsimd; lane++) {
#endif
Coordinate icoor(Ndim), ocoor(Ndim), pgcoor(Ndim);
Lexicographic::CoorFromIndex(icoor, lane, sdims);
Lexicographic::CoorFromIndex(ocoor, idx, rdims);
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
for (int d = 0, dd = 1; d < Ndim; d++)
if (d != dim) { pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d]; dd++; }
int64_t pgidx;
Lexicographic::IndexFromCoor(pgcoor, pgidx, pgdims);
vector_type *to = (vector_type *)&r_v[idx];
scalar_type stmp;
for (int w = 0; w < Ncomp; w++) {
stmp = pgbuf_v[pgidx + w*pgvol];
putlane(to[w], stmp, lane);
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
result = result * div;
t_insert += usecond();
t_total += usecond();
std::cout << GridLogPerformance << " FFT took " << t_total/1.0e6 << " s" << std::endl;
std::cout << GridLogPerformance << " FFT pencil " << t_pencil/1.0e6 << " s" << std::endl;
std::cout << GridLogPerformance << " of which copy " << t_copy/1.0e6 << " s" << std::endl;
std::cout << GridLogPerformance << " of which shift" << t_shift/1.0e6 << " s" << std::endl;
std::cout << GridLogPerformance << " FFT kernels " << t_fft/1.0e6 << " s" << std::endl;
std::cout << GridLogPerformance << " FFT insert " << t_insert/1.0e6 << " s" << std::endl;
}
class FFT : public FFTbase {
public:
FFT(GridCartesian *grid) : FFTbase(grid) {}
~FFT() {}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result, const Lattice<vobj> &source, Coordinate mask, int sign) {
const int Ndim = _grid->Nd();
Lattice<vobj> tmp = source;
for (int d = 0; d < Ndim; d++) {
if (mask[d]) {
FFT_dim(result, tmp, d, sign);
tmp = result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
Coordinate mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
void FFT_all_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int sign) {
Coordinate mask(_grid->Nd(), 1);
FFT_dim_mask(result, source, mask, sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
void FFT_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int dim, int sign) {
GRID_ASSERT(source.Grid() == _grid);
GRID_ASSERT(result.Grid() == _grid);
conformable(result.Grid(), source.Grid());
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
Coordinate layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
// Construct pencils
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
{
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
pokeLocalSite(s,p_v,cbuf);
});
}
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
}
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
thread_for( idx,NN,{
Coordinate cbuf(Nd);
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result_v,clbuf);
});
}
result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
const int Ndim = _grid->Nd();
int G = _grid->_fdimensions[dim];
int Ncomp = sizeof(sobj) / sizeof(scalar);
int64_t Nperp = 1;
for (int d = 0; d < Ndim; d++)
if (d != dim) Nperp *= _grid->_ldimensions[d];
int n[] = {G};
int howmany = Ncomp * Nperp;
deviceVector<scalar> dummy(2);
FFTW_scalar *buf = (FFTW_scalar *)&dummy[0];
FFTW_plan p = FFTW<scalar>::fftw_plan_many_dft(1, n, howmany,
buf, n, 1, G,
buf, n, 1, G,
sign, FFTW_ESTIMATE);
FFT_dim_execute(result, source, dim, sign, p, _grid, flops, flops_call, usec);
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
};
template<class vobj>
class PlannedFFT : public FFTbase {
private:
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
std::vector<FFTW_plan> forward_plans;
std::vector<FFTW_plan> backward_plans;
void PlanCreate() {
const int Ndim = _grid->Nd();
forward_plans.resize(Ndim);
backward_plans.resize(Ndim);
for (int d = 0; d < Ndim; d++) {
int G = _grid->_fdimensions[d];
int Ncomp = sizeof(sobj) / sizeof(scalar);
int64_t Nperp = 1;
for (int dd = 0; dd < Ndim; dd++)
if (dd != d) Nperp *= _grid->_ldimensions[dd];
int howmany = Ncomp * (int)Nperp;
int n[] = {G};
deviceVector<scalar> dummy(2);
FFTW_scalar *buf = (FFTW_scalar *)&dummy[0];
forward_plans[d] = FFTW<scalar>::fftw_plan_many_dft(1, n, howmany, buf, n, 1, G, buf, n, 1, G, FFTW_FORWARD, FFTW_ESTIMATE);
backward_plans[d] = FFTW<scalar>::fftw_plan_many_dft(1, n, howmany, buf, n, 1, G, buf, n, 1, G, FFTW_BACKWARD, FFTW_ESTIMATE);
}
}
void PlanDestroy() {
for (auto p : forward_plans) FFTW<scalar>::fftw_destroy_plan(p);
for (auto p : backward_plans) FFTW<scalar>::fftw_destroy_plan(p);
forward_plans.clear();
backward_plans.clear();
}
public:
PlannedFFT(GridCartesian *grid) : FFTbase(grid) { PlanCreate(); }
~PlannedFFT() { PlanDestroy(); }
void FFT_dim_mask(Lattice<vobj> &result, const Lattice<vobj> &source, Coordinate mask, int sign) {
const int Ndim = _grid->Nd();
Lattice<vobj> tmp = source;
for (int d = 0; d < Ndim; d++) {
if (mask[d]) {
FFT_dim(result, tmp, d, sign);
tmp = result;
}
}
}
void FFT_all_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int sign) {
Coordinate mask(_grid->Nd(), 1);
FFT_dim_mask(result, source, mask, sign);
}
void FFT_dim(Lattice<vobj> &result, const Lattice<vobj> &source, int dim, int sign) {
GRID_ASSERT(source.Grid() == _grid);
GRID_ASSERT(result.Grid() == _grid);
GRID_ASSERT((int)forward_plans.size() == _grid->Nd());
conformable(result.Grid(), source.Grid());
FFTW_plan p = (sign == forward ? forward_plans : backward_plans)[dim];
FFT_dim_execute(result, source, dim, sign, p, _grid, flops, flops_call, usec);
}
};
+25 -25
View File
@@ -64,7 +64,7 @@ public:
//
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// with an GRID_ASSERT trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -148,22 +148,22 @@ public:
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
assert(0);
GRID_ASSERT(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
assert(0);
GRID_ASSERT(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
GRID_ASSERT(0);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
assert(0);
GRID_ASSERT(0);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
assert(0);
GRID_ASSERT(0);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
@@ -188,13 +188,13 @@ public:
ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
GRID_ASSERT(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
GRID_ASSERT(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
GRID_ASSERT(0);
};
void Op (const Field &in, Field &out){
HermOp(in,out);
@@ -271,10 +271,10 @@ public:
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
GRID_ASSERT(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
GRID_ASSERT(0);
}
};
template<class Matrix,class Field>
@@ -303,10 +303,10 @@ public:
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
GRID_ASSERT(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
GRID_ASSERT(0);
}
};
@@ -345,13 +345,13 @@ class SchurOperatorBase : public LinearOperatorBase<Field> {
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system
GRID_ASSERT(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
GRID_ASSERT(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
GRID_ASSERT(0);
};
};
template<class Matrix,class Field>
@@ -447,10 +447,10 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
assert(0);
GRID_ASSERT(0);
}
virtual void HermOp(const Field& in, Field& out) {
assert(0);
GRID_ASSERT(0);
}
void Op(const Field& in, Field& out) {
Mpc(in, out);
@@ -460,13 +460,13 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
}
// Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) {
assert(0); // must coarsen the unpreconditioned system
GRID_ASSERT(0); // must coarsen the unpreconditioned system
}
void OpDir(const Field& in, Field& out, int dir, int disp) {
assert(0);
GRID_ASSERT(0);
}
void OpDirAll(const Field& in, std::vector<Field>& out){
assert(0);
GRID_ASSERT(0);
};
};
@@ -580,7 +580,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
GRID_ASSERT( _Mat.isTrivialEE() );
mass = _Mat.Mass();
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -611,7 +611,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered
GRID_ASSERT(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
@@ -623,7 +623,7 @@ template<class Field> class OperatorFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
assert(in.size()==out.size());
GRID_ASSERT(in.size()==out.size());
for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]);
}
@@ -637,7 +637,7 @@ public:
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
assert(in.size() == out.size());
GRID_ASSERT(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i)
{
+3 -3
View File
@@ -121,7 +121,7 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
// Reallocate arrays, since degree has changed
if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
assert(a_len<=SUM_MAX);
GRID_ASSERT(a_len<=SUM_MAX);
step = new bigfloat[num_degree+den_degree+2];
@@ -151,9 +151,9 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
equations();
if (delta < tolerance) {
std::cout<<"Delta too small, try increasing precision\n";
assert(0);
GRID_ASSERT(0);
};
assert( delta>= tolerance);
GRID_ASSERT( delta>= tolerance);
search(step);
}
+1 -1
View File
@@ -134,7 +134,7 @@ class AlgRemez
virtual ~AlgRemez();
int getDegree(void){
assert(n==d);
GRID_ASSERT(n==d);
return n;
}
// Reset the bounds of the approximation
+8 -8
View File
@@ -28,11 +28,11 @@ void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyTy
pow_n = num_degree;
pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) GRID_ASSERT(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) GRID_ASSERT(0);
num_type = num_type_in;
den_type = den_type_in;
@@ -112,9 +112,9 @@ double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degre
equations();
if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
assert(0);
GRID_ASSERT(0);
};
assert( delta>= tolerance );
GRID_ASSERT( delta>= tolerance );
search();
}
@@ -278,7 +278,7 @@ void AlgRemezGeneral::equations(){
if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x;
}
assert(t == n+1);
GRID_ASSERT(t == n+1);
z = (bigfloat)1l;
t = 0;
@@ -286,7 +286,7 @@ void AlgRemezGeneral::equations(){
if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x;
}
assert(t == d);
GRID_ASSERT(t == d);
B[i] = y * z; // Right hand side vector
}
+1 -1
View File
@@ -106,7 +106,7 @@ class AlgRemezGeneral{
bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{
assert(n==d);
GRID_ASSERT(n==d);
return n;
}
// Reset the bounds of the approximation
+1 -1
View File
@@ -74,7 +74,7 @@ bigfloat epsilonMobius(bigfloat x, void* data){
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){
assert(omega_in.size() == Ls_in);
GRID_ASSERT(omega_in.size() == Ls_in);
omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial
+515 -75
View File
@@ -28,6 +28,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
#pragma once
#ifdef GRID_HIP
#include <hip/hip_version.h>
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
@@ -65,6 +66,7 @@ NAMESPACE_BEGIN(Grid);
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
class GridBLAS {
public:
@@ -97,7 +99,22 @@ public:
gridblasInit=1;
}
}
#ifdef GRID_CUDA
cublasComputeType_t toDataType(GridBLASPrecision_t p) {
switch (p) {
case GridBLAS_PRECISION_16F:
return CUBLAS_COMPUTE_32F_FAST_16F;
case GridBLAS_PRECISION_16BF:
return CUBLAS_COMPUTE_32F_FAST_16BF;
case GridBLAS_PRECISION_TF32:
return CUBLAS_COMPUTE_32F_FAST_TF32;
default:
GRID_ASSERT(0);
}
return CUBLAS_COMPUTE_32F_FAST_16F;
}
#endif
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
@@ -119,11 +136,11 @@ public:
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
assert(err==hipSuccess);
GRID_ASSERT(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess);
GRID_ASSERT(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
@@ -138,8 +155,10 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
@@ -201,12 +220,14 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
@@ -235,18 +256,31 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
(hipDoubleComplex *) &alpha_p[0],
(hipDoubleComplex **)&Amk[0], lda,
(hipDoubleComplex **)&Bkn[0], ldb,
(hipDoubleComplex *) &beta_p[0],
(hipDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS);
#else
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -267,7 +301,7 @@ public:
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -448,7 +482,8 @@ public:
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
deviceVector<ComplexF*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
@@ -470,9 +505,10 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -481,18 +517,32 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
(hipComplex *) &alpha_p[0],
(hipComplex **)&Amk[0], lda,
(hipComplex **)&Bkn[0], ldb,
(hipComplex *) &beta_p[0],
(hipComplex **)&Cmn[0], ldc,
batchCount);
#else
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -503,50 +553,68 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
cublasStatus_t err;
if (precision == GridBLAS_PRECISION_DEFAULT) {
err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
} else {
cublasComputeType_t compute_precision = toDataType(precision);
err = cublasGemmBatchedEx(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(void *) &alpha_p[0],
(void **)&Amk[0], CUDA_C_32F, lda,
(void **)&Bkn[0], CUDA_C_32F, ldb,
(void *) &beta_p[0],
(void **)&Cmn[0], CUDA_C_32F, ldc,
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
}
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
@@ -643,8 +711,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
GRID_ASSERT(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -660,8 +728,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
@@ -681,7 +749,8 @@ public:
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -702,7 +771,7 @@ public:
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -802,8 +871,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
GRID_ASSERT(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -820,8 +889,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
@@ -841,7 +910,8 @@ public:
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -862,7 +932,7 @@ public:
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -946,6 +1016,376 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
/*
Inverse and Determinant
- CPU version uses Eigen
- GPU version uses LAPACK-compatible getrf / getri
Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
Overhead to go through getrf / getri for CPU version too large.
Current interface therefore only guarantees the inverse and determinant
functions on all platforms but not the getrf / getri ones.
*/
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
void inverseBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &Cnn) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void inverseBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &Cnn) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &C) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &C) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
#else
#ifdef GRID_SYCL
template<typename T>
void getrfBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
&n, &n,
(T **)&Ann[0],
&n,
(int64_t**)&_ipiv[0],
(int64_t)1, &batchCount,
(T*)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
}
#endif
void getrfBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
void getrfBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
#ifdef GRID_SYCL
template<typename T>
void getriBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<T*> &Cnn) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getri_batch(*gridblasHandle,
&n,
(T **)&Ann[0],
&n,
(int64_t**)p_ipiv,
(int64_t)1, &batchCount,
(T *)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
T** pA = &Ann[0];
T** pC = &Cnn[0];
accelerator_for(i, batchCount*n*n, 1, {
auto j = i / batchCount;
auto k = i % batchCount;
pC[k][j] = pA[k][j];
});
}
#endif
void getriBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexD*> &Cnn)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetriBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
void getriBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexF*> &Cnn)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<" "<<__LINE__<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetriBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
template<typename dtype>
void inverseBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &Cnn // this will be overwritten with the inverse
) {
int64_t batchCount = Ann.size();
RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
// test info for non-invertibility? set to nan if yes?
getriBatched(n, Ann, ipiv, info, Cnn);
//synchronise();
//RealD t2 = usecond();
//std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
}
template<typename dtype>
void determinantBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &C // this will be overwritten with determinant
) {
int64_t batchCount = Ann.size();
//RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
dtype** pAnn = (dtype**)&Ann[0];
dtype** pC = (dtype**)&C[0];
#if defined(GRID_CUDA) || defined(GRID_HIP)
int* pipiv = (int*)&ipiv[0];
#else
int64_t* pipiv = (int64_t*)&ipiv[0];
#endif
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
//RealD t2 = usecond();
accelerator_for(i,batchCount,1,{
dtype det = 1.0;
for (int64_t j=0;j<n;j++) {
det *= pAnn[i][n*j + j];
// branchless signs
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
}
*pC[i] = det;
});
//RealD t3 = usecond();
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
}
#endif
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
+300
View File
@@ -0,0 +1,300 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MomentumProject.h
Copyright (C) 2025
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MultiMomProject
Import vectors -> nxyz x (ncomponent x nt)
Import complex phases -> nmom x nxy
apply = via (possibly batched) GEMM
*/
template<class Field, class ComplexField>
class MomentumProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
GridBase *grid;
uint64_t nmom;
uint64_t nxyz;
uint64_t nt;
uint64_t nbtw;
uint64_t words;
deviceVector<scalar> BLAS_V; //
deviceVector<scalar> BLAS_M; //
deviceVector<scalar> BLAS_P; //
MomentumProject(){};
~MomentumProject(){ Deallocate(); };
void Deallocate(void)
{
grid=nullptr;
nmom=0;
nxyz=0;
nt=0;
nbtw=0;
words=0;
BLAS_V.resize(0);
BLAS_M.resize(0);
BLAS_P.resize(0);
}
void Allocate(int _nmom,GridBase *_grid)
{
grid=_grid;
Coordinate ldims = grid->LocalDimensions();
nmom=_nmom;
nt = ldims[grid->Nd()-1];
nxyz = grid->lSites()/nt;
words = sizeof(scalar_object)/sizeof(scalar);
nbtw = nt * words;
BLAS_V.resize (nxyz * nt * words );
BLAS_M.resize (nmom * nxyz );
BLAS_P.resize (nmom * nt * words );
}
void ImportMomenta(const std::vector <ComplexField> &momenta)
{
GRID_ASSERT(momenta.size()==nmom);
// might as well just make the momenta here
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_M.size();
GRID_ASSERT(momenta.size()==nmom)
GRID_ASSERT(momenta[0].Grid()==grid);
GRID_ASSERT(sz = nxyz * nmom);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims = grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords = words; // local variable for copy in to GPU
int64_t Nxyz = nxyz;
auto blasData_p = &BLAS_M[0];
for(int m=0;m<momenta.size();m++){
autoView( Data , momenta[m], AcceleratorRead);
auto Data_p = &Data[0];
accelerator_for(xyz,nxyz,1,{
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Lexicographic::CoorFromIndex(lcoor,xyz,ldims);
Coordinate icoor(nd);
Coordinate ocoor(nd);
for (int d = 0; d < nd; d++) {
icoor[d] = lcoor[d]/rdimensions[d];
ocoor[d] = lcoor[d]%rdimensions[d];
}
int64_t osite;
int64_t isite;
Lexicographic::IndexFromCoor(ocoor,osite,rdimensions);
Lexicographic::IndexFromCoor(icoor,isite,simd);
// BLAS_M[nmom][slice_vol]
// Fortran Column major BLAS layout is M_xyz,mom
scalar data = extractLane(isite,Data[osite]);
uint64_t idx = xyz+m*Nxyz;
blasData_p[idx] = data;
});
}
}
void ImportVector(Field &vec)
{
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_V.size();
GRID_ASSERT(sz = nxyz * words * nt);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims= grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords= words; // local variable for copy in to GPU
auto blasData_p = &BLAS_V[0];
autoView( Data , vec, AcceleratorRead);
auto Data_p = &Data[0];
int64_t nwords = words;// for capture
int64_t Nt = nt;// for capture
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Coordinate icoor(nd);
Coordinate ocoor(nd);
Lexicographic::CoorFromIndex(icoor,lane,simd);
Lexicographic::CoorFromIndex(ocoor,sf,rdimensions);
int64_t l_xyz = 0;
for (int d = 0; d < nd; d++) {
lcoor[d] = rdimensions[d]*icoor[d] + ocoor[d];
}
uint64_t l_t = lcoor[nd-1];
Coordinate xyz_coor = lcoor;
xyz_coor[nd-1] =0;
Lexicographic::IndexFromCoor(xyz_coor,l_xyz,ldims);
scalar_object data = extractLane(lane,Data[sf]);
scalar *data_words = (scalar *) &data;
for(int w = 0 ; w < nwords; w++) {
// BLAS_V[slice_vol][nt][words]
// Fortran Column major BLAS layout is V_(t,w)_xyz
uint64_t idx = w+l_t*nwords + l_xyz * nwords * Nt;
blasData_p[idx] = data_words[w];
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
void ExportMomentumProjection(std::vector<typename Field::scalar_object> &projection)
{
projection.resize(nmom*nt);
acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar));
// Could decide on a layout late?
}
// Row major layout "C" order:
// BLAS_V[slice_vol][nt][words]
// BLAS_M[nmom][slice_vol]
// BLAS_P[nmom][nt][words]
//
// Fortran Column major BLAS layout is V_(w,t)_xyz
// Fortran Column major BLAS layout is M_xyz,mom
// Fortran Column major BLAS layout is P_(w,t),mom
//
// Projected
//
// P = (V * M)_(w,t),mom
//
void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
{
double t_import=0;
double t_export=0;
double t_gemm =0;
double t_allreduce=0;
t_import-=usecond();
this->ImportVector(data);
std::vector< typename Field::scalar_object > projected_planes;
deviceVector<scalar *> Vd(1);
deviceVector<scalar *> Md(1);
deviceVector<scalar *> Pd(1);
scalar * Vh = & BLAS_V[0];
scalar * Mh = & BLAS_M[0];
scalar * Ph = & BLAS_P[0];
acceleratorPut(Vd[0],Vh);
acceleratorPut(Md[0],Mh);
acceleratorPut(Pd[0],Ph);
t_import+=usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// P_im = VMmx . Vxi
/////////////////////////////////////////
t_gemm-=usecond();
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
words*nt,nmom,nxyz,
scalar(1.0),
Vd,
Md,
scalar(0.0), // wipe out result
Pd);
BLAS.synchronise();
t_gemm+=usecond();
t_export-=usecond();
ExportMomentumProjection(projected_planes); // resizes
t_export+=usecond();
/////////////////////////////////
// Reduce across MPI ranks
/////////////////////////////////
int nd = grid->Nd();
int gt = grid->GlobalDimensions()[nd-1];
int lt = grid->LocalDimensions()[nd-1];
projected_gdata.resize(gt*nmom);
for(int t=0;t<gt*nmom;t++){ // global Nt array with zeroes for stuff not on this node
projected_gdata[t]=Zero();
}
for(int t=0;t<lt;t++){
for(int m=0;m<nmom;m++){
int st = grid->LocalStarts()[nd-1];
projected_gdata[t+st + gt*m] = projected_planes[t+lt*m];
}}
t_allreduce-=usecond();
grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
t_allreduce+=usecond();
std::cout << GridLogPerformance<<" MomentumProject t_import "<<t_import<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_export "<<t_export<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_gemm "<<t_gemm<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_reduce "<<t_allreduce<<"us"<<std::endl;
}
};
NAMESPACE_END(Grid);
+6 -7
View File
@@ -69,8 +69,8 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
assert(evec.size()==eval.size());
assert(N <= evec.size());
GRID_ASSERT(evec.size()==eval.size());
GRID_ASSERT(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) {
@@ -141,11 +141,10 @@ public:
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
for (int j=0;j<Nsrc;j++) {
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
@@ -160,7 +160,7 @@ public:
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
@@ -259,7 +259,7 @@ public:
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
@@ -267,7 +267,7 @@ public:
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
GRID_ASSERT(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
@@ -131,12 +131,12 @@ public:
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
assert(vecs[0].Grid()==fine_grid);
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
@@ -164,7 +164,7 @@ public:
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
GRID_ASSERT(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
@@ -198,7 +198,7 @@ public:
+ v*bv
+ sb;
// assert(site*lwords<sz);
// GRID_ASSERT(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
@@ -219,12 +219,12 @@ public:
int nvec = vecs.size();
assert(vecs[0].Grid()==fine_grid);
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
@@ -299,7 +299,7 @@ public:
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
@@ -320,7 +320,7 @@ public:
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT
@@ -353,7 +353,7 @@ public:
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
@@ -375,7 +375,7 @@ public:
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
@@ -409,7 +409,7 @@ public:
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
assert(nbasis==_nbasis);
GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -464,7 +464,7 @@ public:
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
assert(nbasis==_nbasis);
GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -98,7 +98,7 @@ public:
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
assert(ev<eval.size());
GRID_ASSERT(ev<eval.size());
eval[ev] = _eval;
int64_t offset = ev*vol*words;
@@ -113,7 +113,7 @@ public:
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
assert(_ev0+_nev<=evec.size());
GRID_ASSERT(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid());
@@ -126,8 +126,8 @@ public:
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
int nrhs = source.size();
assert(source.size()==guess.size());
assert(grid == guess[0].Grid());
GRID_ASSERT(source.size()==guess.size());
GRID_ASSERT(grid == guess[0].Grid());
conformable(guess[0],source[0]);
int64_t vw = vol * words;
@@ -189,7 +189,7 @@ public:
Cd);
BLAS.synchronise();
assert(BLAS_C.size()==nev*nrhs);
GRID_ASSERT(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
+1 -1
View File
@@ -270,7 +270,7 @@ class TwoLevelCG : public LinearFunction<Field>
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
+5 -5
View File
@@ -92,8 +92,8 @@ class TwoLevelCGmrhs
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -161,7 +161,7 @@ class TwoLevelCGmrhs
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
ssq[rhs]=norm2(src[rhs]); GRID_ASSERT(ssq[rhs]!=0.0);
}
///////////////////////////
@@ -382,7 +382,7 @@ class TwoLevelCGmrhs
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
GRID_ASSERT(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
@@ -415,7 +415,7 @@ class TwoLevelCGmrhs
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
+4 -4
View File
@@ -47,7 +47,7 @@ class BiCGSTAB : public OperatorFunction<Field>
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -77,7 +77,7 @@ class BiCGSTAB : public OperatorFunction<Field>
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
Linop.Op(psi, v);
b = norm2(v);
@@ -214,7 +214,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
if(ErrorOnNoConverge){ GRID_ASSERT(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k;
@@ -224,7 +224,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ assert(0); }
if(ErrorOnNoConverge){ GRID_ASSERT(0); }
IterationsToComplete = k;
}
};
@@ -98,7 +98,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -201,7 +201,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
GRID_ASSERT(0);
}
}
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
@@ -209,7 +209,7 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi);
} else {
assert(0);
GRID_ASSERT(0);
}
}
@@ -259,10 +259,10 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -402,7 +402,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) assert(0);
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
@@ -438,10 +438,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
@@ -540,7 +540,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
}
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k;
}
@@ -554,7 +554,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{
Nblock = B.size();
assert(Nblock == X.size());
GRID_ASSERT(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
@@ -594,10 +594,10 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -731,7 +731,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k;
}
@@ -36,7 +36,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when CAGMRES fails to converge,
// defaults to true
RealD Tolerance;
@@ -82,7 +82,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
@@ -137,7 +137,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
GRID_ASSERT(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -185,7 +185,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
}
}
assert(0); // Never reached
GRID_ASSERT(0); // Never reached
return cp;
}
@@ -45,7 +45,7 @@ public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -94,7 +94,7 @@ public:
ssq = norm2(src);
RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) {
r = src;
@@ -222,7 +222,7 @@ public:
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
TrueResidual = true_residual;
@@ -251,7 +251,7 @@ public:
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(0);
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k;
}
@@ -77,7 +77,7 @@ public:
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
GRID_ASSERT(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
@@ -98,9 +98,9 @@ public:
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
GRID_ASSERT(psi.size()==nshift);
GRID_ASSERT(mass.size()==nshift);
GRID_ASSERT(mresidual.size()==nshift);
// remove dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
@@ -122,7 +122,7 @@ public:
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
GRID_ASSERT( mass[s]>= mass[primary] );
converged[s]=0;
}
@@ -338,7 +338,7 @@ public:
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
// GRID_ASSERT(0);
}
};
@@ -118,9 +118,9 @@ public:
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
GRID_ASSERT(psi_d.size()==nshift);
GRID_ASSERT(mass.size()==nshift);
GRID_ASSERT(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
@@ -141,7 +141,7 @@ public:
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
GRID_ASSERT( mass[s]>= mass[primary] );
converged[s]=0;
}
@@ -179,7 +179,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
// GRID_ASSERT(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
@@ -365,7 +365,7 @@ public:
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
GRID_ASSERT(0);
}
};
@@ -48,12 +48,12 @@ public:
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void Op (const Field &in, Field &out){ GRID_ASSERT(0); }
void AdjOp (const Field &in, Field &out){ GRID_ASSERT(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
@@ -151,9 +151,9 @@ public:
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
GRID_ASSERT(psi_d.size()==nshift);
GRID_ASSERT(mass.size()==nshift);
GRID_ASSERT(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
@@ -174,7 +174,7 @@ public:
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
GRID_ASSERT( mass[s]>= mass[primary] );
converged[s]=0;
}
@@ -211,7 +211,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
GRID_ASSERT(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
@@ -408,7 +408,7 @@ public:
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
GRID_ASSERT(0);
}
};
@@ -35,7 +35,7 @@ template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -66,7 +66,7 @@ public:
DoFinalCleanup(true),
Linop_fallback(NULL)
{
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
GRID_ASSERT(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
@@ -90,7 +90,7 @@ public:
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
@@ -217,7 +217,7 @@ public:
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
else if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
@@ -263,7 +263,7 @@ public:
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
@@ -106,7 +106,7 @@ public:
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
GRID_ASSERT(0);
}
};
NAMESPACE_END(Grid);
@@ -36,7 +36,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FCAGMRES fails to converge,
// defaults to true
RealD Tolerance;
@@ -87,7 +87,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
@@ -144,7 +144,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
GRID_ASSERT(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -191,7 +191,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
}
}
assert(0); // Never reached
GRID_ASSERT(0); // Never reached
return cp;
}
@@ -36,7 +36,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FGMRES fails to converge,
// defaults to true
RealD Tolerance;
@@ -85,7 +85,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
@@ -142,7 +142,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
GRID_ASSERT(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -189,7 +189,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
}
}
assert(0); // Never reached
GRID_ASSERT(0); // Never reached
return cp;
}
@@ -36,7 +36,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when GMRES fails to converge,
// defaults to true
RealD Tolerance;
@@ -80,7 +80,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
@@ -135,7 +135,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
GRID_ASSERT(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -181,7 +181,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
}
}
assert(0); // Never reached
GRID_ASSERT(0); // Never reached
return cp;
}
@@ -175,7 +175,7 @@ public:
eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation),split_test(0),
Nevec_acc(_Nu)
{ assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
{ GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
////////////////////////////////
// Helpers
@@ -206,7 +206,7 @@ public:
Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
}
}
assert(normalize(w,if_print) != 0);
GRID_ASSERT(normalize(w,if_print) != 0);
}
void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
{
@@ -225,7 +225,7 @@ public:
w[i] = w[i] - ip * evec[j];
}}
for(int i=0; i<_Nu; ++i)
assert(normalize(w[i],if_print) !=0);
GRID_ASSERT(normalize(w[i],if_print) !=0);
}
@@ -244,7 +244,7 @@ public:
const uint64_t sites = grid->lSites();
int Nbatch = R/Nevec_acc;
assert( R%Nevec_acc == 0 );
GRID_ASSERT( R%Nevec_acc == 0 );
// Glog << "nBatch, Nevec_acc, R, Nu = "
// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
@@ -302,7 +302,7 @@ public:
}
}
for (int i=0; i<Nu; ++i) {
assert(normalize(w[i],do_print)!=0);
GRID_ASSERT(normalize(w[i],do_print)!=0);
}
Glog << "cuBLAS Zgemm done"<< std::endl;
@@ -374,8 +374,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{
std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid());
assert( Nu = src.size() );
GRID_ASSERT(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() );
Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -396,7 +396,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
}
Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size());
GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -579,8 +579,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{
std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid());
assert( Nu = src.size() );
GRID_ASSERT(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() );
int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -607,7 +607,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
}
Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size());
GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -785,7 +785,7 @@ private:
int Nu = w.size();
int Nm = evec.size();
assert( b < Nm/Nu );
GRID_ASSERT( b < Nm/Nu );
// GridCartesian *grid = evec[0]._grid;
// converts block index to full indicies for an interval [L,R)
@@ -796,7 +796,7 @@ private:
Glog << "Using split grid"<< std::endl;
// LatticeGaugeField s_Umu(SGrid);
assert((Nu%mrhs)==0);
GRID_ASSERT((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid);
Field s_in(sf_grid);
@@ -906,7 +906,7 @@ if(split_test){
for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
assert (!isnan(norm2(w[u])));
GRID_ASSERT (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) {
Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
}
@@ -929,8 +929,8 @@ if(split_test){
Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid)
{
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) {
@@ -970,8 +970,8 @@ if(split_test){
GridBase *grid)
{
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) {
@@ -1119,7 +1119,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif
} else {
assert(0);
GRID_ASSERT(0);
}
}
@@ -1131,8 +1131,8 @@ if (1){
Eigen::MatrixXcd& M)
{
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange
@@ -1159,8 +1159,8 @@ if (1){
Eigen::MatrixXcd& M)
{
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
// rearrange
for ( int u=0; u<Nu; ++u ) {
@@ -121,7 +121,7 @@ public:
eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation),
Nevec_acc(_Nu)
{ assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
{ GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
////////////////////////////////
// Helpers
@@ -151,7 +151,7 @@ public:
Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
}
}
assert(normalize(w,if_print) != 0);
GRID_ASSERT(normalize(w,if_print) != 0);
}
void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
{
@@ -169,7 +169,7 @@ public:
w[i] = w[i] - ip * evec[j];
}}
for(int i=0; i<_Nu; ++i)
assert(normalize(w[i],if_print) !=0);
GRID_ASSERT(normalize(w[i],if_print) !=0);
}
void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
@@ -205,8 +205,8 @@ public:
{
std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid());
assert( Nu = src.size() );
GRID_ASSERT(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() );
Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -227,7 +227,7 @@ public:
}
Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size());
GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -413,8 +413,8 @@ public:
{
std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid());
assert( Nu = src.size() );
GRID_ASSERT(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() );
int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -441,7 +441,7 @@ public:
}
Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size());
GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -622,7 +622,7 @@ private:
int Nu = w.size();
int Nm = evec.size();
assert( b < Nm/Nu );
GRID_ASSERT( b < Nm/Nu );
// converts block index to full indicies for an interval [L,R)
int L = Nu*b;
@@ -630,7 +630,7 @@ private:
Real beta;
assert((Nu%mrhs)==0);
GRID_ASSERT((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid);
std::vector<Field> out(mrhs,f_grid);
@@ -711,7 +711,7 @@ private:
for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
assert (!isnan(norm2(w[u])));
GRID_ASSERT (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) {
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
}
@@ -734,8 +734,8 @@ private:
Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid)
{
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) {
@@ -775,8 +775,8 @@ private:
GridBase *grid)
{
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) {
@@ -924,7 +924,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif
} else {
assert(0);
GRID_ASSERT(0);
}
}
@@ -936,8 +936,8 @@ if (1){
Eigen::MatrixXcd& M)
{
// Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange
@@ -964,8 +964,8 @@ if (1){
Eigen::MatrixXcd& M)
{
// Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm );
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm );
// rearrange
for ( int u=0; u<Nu; ++u ) {
@@ -211,7 +211,7 @@ until convergence
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{
GridBase *grid = src.Grid();
assert(grid == evec[0].Grid());
GRID_ASSERT(grid == evec[0].Grid());
// GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -231,7 +231,7 @@ until convergence
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
assert(Nm <= evec.size() && Nm <= eval.size());
GRID_ASSERT(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
@@ -337,7 +337,7 @@ until convergence
}
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
GRID_ASSERT(k2<Nm); GRID_ASSERT(k2<Nm); GRID_ASSERT(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
@@ -463,7 +463,7 @@ until convergence
{
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20;
assert( k< Nm );
GRID_ASSERT( k< Nm );
GridStopWatch gsw_op,gsw_o;
@@ -597,7 +597,7 @@ until convergence
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else {
assert(0);
GRID_ASSERT(0);
}
}
@@ -687,7 +687,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
}
}
#else
assert(0);
GRID_ASSERT(0);
#endif
}
@@ -80,7 +80,7 @@ public:
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace)
{
assert(subspace.size() >0);
GRID_ASSERT(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) {
@@ -346,12 +346,12 @@ public:
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
GRID_ASSERT(evals_fine.size() == nbasis);
GRID_ASSERT(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
GRID_ASSERT(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
}
}
@@ -359,8 +359,8 @@ public:
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
GRID_ASSERT(evals_fine.size() == nbasis);
GRID_ASSERT(subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
@@ -380,7 +380,7 @@ public:
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
assert(nbasis<=Nm);
GRID_ASSERT(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp);
@@ -400,8 +400,8 @@ public:
IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
GRID_ASSERT(Nstop>=nbasis);
GRID_ASSERT(Nconv>=nbasis);
evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid);
}
@@ -433,7 +433,7 @@ public:
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
assert(Nconv>=Nstop);
GRID_ASSERT(Nconv>=Nstop);
evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){
+4 -4
View File
@@ -35,7 +35,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the MR fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -59,7 +59,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
@@ -136,7 +136,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge)
assert(true_residual / Tolerance < 10000.0);
GRID_ASSERT(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
@@ -148,7 +148,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
<< std::endl;
if (ErrorOnNoConverge)
assert(0);
GRID_ASSERT(0);
IterationsToComplete = k;
}
@@ -37,7 +37,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when MPFGMRES fails to converge,
// defaults to true
RealD Tolerance;
@@ -91,7 +91,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
@@ -150,7 +150,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
GRID_ASSERT(0);
}
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
@@ -197,7 +197,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
}
}
assert(0); // Never reached
GRID_ASSERT(0); // Never reached
return cp;
}
@@ -112,7 +112,7 @@ public:
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
GRID_ASSERT(0);
}
};
NAMESPACE_END(Grid);
@@ -118,7 +118,7 @@ public:
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
// GRID_ASSERT(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -221,7 +221,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -231,7 +231,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
GRID_ASSERT(0); // never reached
return cp;
}
};
@@ -113,7 +113,7 @@ public:
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
// GRID_ASSERT(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -224,7 +224,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -234,7 +234,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
GRID_ASSERT(0); // never reached
return cp;
}
};
@@ -79,7 +79,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
LinOp.Op(x,r); r = b - r;
assert(normb> 0.0);
GRID_ASSERT(normb> 0.0);
resid = norm2(r)/normb;
if (resid <= Tolerance) {
@@ -105,8 +105,8 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests
assert( rho != 0.0);
assert( xi != 0.0);
GRID_ASSERT( rho != 0.0);
GRID_ASSERT( xi != 0.0);
v = (1. / rho) * v_tld;
y = (1. / rho) * y;
@@ -134,10 +134,10 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit
assert(abs(ep)>0);
GRID_ASSERT(abs(ep)>0);
beta = ep / delta;
assert(abs(beta)>0);
GRID_ASSERT(abs(beta)>0);
v_tld = p_tld - beta * v;
y = v_tld;
@@ -158,7 +158,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl;
assert(abs(gamma)> 0.0);
GRID_ASSERT(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
@@ -178,7 +178,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
}
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
}
assert(0);
GRID_ASSERT(0);
return; // no convergence
}
#else
+54 -54
View File
@@ -327,9 +327,9 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
}
@@ -347,17 +347,17 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
@@ -396,13 +396,13 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
// get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -416,17 +416,17 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; GRID_ASSERT( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
@@ -461,9 +461,9 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -478,18 +478,18 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
_Matrix.Meooe(sol_o, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; GRID_ASSERT( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); GRID_ASSERT( sol_o.Checkerboard() == Odd );
}
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); GRID_ASSERT(sol_o.Checkerboard() == Odd);
}
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
@@ -539,13 +539,13 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
_Matrix.MooeeInv(Mtmp,tmp); GRID_ASSERT( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -560,12 +560,12 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -612,12 +612,12 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -638,12 +638,12 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
_Matrix.Meooe(sol_o_i,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -684,9 +684,9 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -707,12 +707,12 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
_Matrix.Meooe(sol_o_i, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
tmp = src_e - tmp; GRID_ASSERT( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() == Odd );
};
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
+10 -7
View File
@@ -97,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-3,400,false);
ConjugateGradient<FineField> CG(1.0e-4,2000,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@@ -131,7 +131,10 @@ public:
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
// PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,30,30);
// PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,12,12);
// PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,10,10);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
@@ -146,16 +149,16 @@ public:
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
for(int i=0;i<3;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
if (i==0)std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
if (i==0)std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
@@ -167,7 +170,7 @@ public:
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<" <f|OpDagOp|f>"<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
@@ -292,7 +295,7 @@ public:
}
}
assert(b==nn);
GRID_ASSERT(b==nn);
}
+6 -6
View File
@@ -309,7 +309,7 @@ public:
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
std::cout <<"MdirAll out size "<< out.size()<<std::endl;
std::cout <<"MdirAll ndir "<< ndir<<std::endl;
assert(0);
GRID_ASSERT(0);
}
for(int p=0;p<ndir;p++){
MdirCalc(in,out[p],p);
@@ -373,7 +373,7 @@ public:
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
GRID_ASSERT(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, Aodd, in, out, dag);
@@ -383,7 +383,7 @@ public:
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
GRID_ASSERT(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, Aeven, in, out, dag);
@@ -391,7 +391,7 @@ public:
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
out.Checkerboard() = in.Checkerboard();
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
CoarseMatrix *Aself = nullptr;
if(in.Grid()->_isCheckerBoarded) {
@@ -406,7 +406,7 @@ public:
Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
DselfInternal(Stencil, *Aself, in, out, dag);
}
assert(Aself != nullptr);
GRID_ASSERT(Aself != nullptr);
}
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
@@ -697,7 +697,7 @@ public:
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask;
assert(self_stencil!=-1);
GRID_ASSERT(self_stencil!=-1);
for(int i=0;i<nbasis;i++){
@@ -99,7 +99,7 @@ public:
}
}
}
assert(nfound==geom.npoint);
GRID_ASSERT(nfound==geom.npoint);
ExchangeCoarseLinks();
}
*/
@@ -124,7 +124,7 @@ public:
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
assert(hermitian);
GRID_ASSERT(hermitian);
Mult(_A,in,out);
// if ( hermitian ) M(in,out);
// else Mult(_Adag,in,out);
@@ -619,7 +619,7 @@ public:
// _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
}
}
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdiag (const Field &in, Field &out){ GRID_ASSERT(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
@@ -80,12 +80,12 @@ public:
// Can be used to do I/O on the operator matrices externally
void SetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
GRID_ASSERT(A.size()==geom_srhs.npoint);
GridtoBLAS(A[p],BLAS_A[p]);
}
void GetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
GRID_ASSERT(A.size()==geom_srhs.npoint);
BLAStoGrid(A[p],BLAS_A[p]);
}
void CopyMatrix (GeneralCoarseOp &_Op)
@@ -178,14 +178,14 @@ public:
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
assert(nbr<BLAS_B.size());
GRID_ASSERT(nbr<BLAS_B.size());
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
}
j++;
}
}
assert(j==unpadded_sites);
GRID_ASSERT(j==unpadded_sites);
}
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
{
@@ -194,7 +194,7 @@ public:
typedef typename vobj::vector_type vector_type;
GridBase *Fg = from.Grid();
assert(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Fg->_isCheckerBoarded);
int nd = Fg->_ndimension;
to.resize(Fg->lSites());
@@ -241,10 +241,10 @@ public:
typedef typename vobj::vector_type vector_type;
GridBase *Tg = grid.Grid();
assert(!Tg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded);
int nd = Tg->_ndimension;
assert(in.size()==Tg->lSites());
GRID_ASSERT(in.size()==Tg->lSites());
Coordinate LocalLatt = Tg->LocalDimensions();
size_t nsite = 1;
@@ -669,7 +669,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
const int Nsimd = CComplex::Nsimd();
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
assert(nrhs>=1);
GRID_ASSERT(nrhs>=1);
RealD flops,bytes;
int64_t osites=in.Grid()->oSites(); // unpadded
@@ -721,7 +721,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdiag (const Field &in, Field &out){ GRID_ASSERT(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
+3 -3
View File
@@ -67,8 +67,8 @@ public:
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
GRID_ASSERT(disp == -1 || disp == 0 || disp == 1);
GRID_ASSERT(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
@@ -131,7 +131,7 @@ public:
return p;
}
}
assert(0);
GRID_ASSERT(0);
return -1;
}
void BuildShifts(void)
+3 -3
View File
@@ -57,7 +57,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -106,7 +106,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -154,7 +154,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
+2 -2
View File
@@ -292,7 +292,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
GRID_ASSERT(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
@@ -345,7 +345,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
GRID_ASSERT(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
+56 -56
View File
@@ -50,12 +50,12 @@ int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
auto count = AccViewTable.count(CpuPtr); GRID_ASSERT((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
GRID_ASSERT(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
@@ -69,9 +69,9 @@ void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,View
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
GRID_ASSERT(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
GRID_ASSERT(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
@@ -81,7 +81,7 @@ void MemoryManager::EntryErase(uint64_t CpuPtr)
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
GRID_ASSERT(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
@@ -94,7 +94,7 @@ void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
GRID_ASSERT(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
@@ -108,12 +108,12 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
GRID_ASSERT(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
GRID_ASSERT(AccCache.accLock==0);
GRID_ASSERT(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
@@ -138,7 +138,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
// Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
GRID_ASSERT(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
@@ -162,11 +162,11 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
GRID_ASSERT(AccCache.state==AccDirty);
GRID_ASSERT(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock==0);
GRID_ASSERT(AccCache.AccPtr!=(uint64_t)NULL);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
@@ -175,10 +175,10 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
GRID_ASSERT(AccCache.state==CpuDirty);
GRID_ASSERT(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock==0);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
@@ -194,10 +194,10 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
GRID_ASSERT(AccCache.state!=Empty);
GRID_ASSERT(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock==0);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
@@ -216,7 +216,7 @@ void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
GRID_ASSERT(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
@@ -228,7 +228,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
GRID_ASSERT(0);
return NULL;
}
}
@@ -237,10 +237,10 @@ void MemoryManager::EvictVictims(uint64_t bytes)
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
GRID_ASSERT(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
GRID_ASSERT(LRU.size()>0);
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
@@ -264,9 +264,9 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
GRID_ASSERT((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
GRID_ASSERT(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
@@ -275,8 +275,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
(uint64_t)AccCache.bytes,
(uint64_t)bytes,
(uint64_t)AccCache.accLock);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
GRID_ASSERT(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
@@ -293,7 +293,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
GRID_ASSERT(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
@@ -338,10 +338,10 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
GRID_ASSERT(0);
}
assert(AccCache.accLock>0);
GRID_ASSERT(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
@@ -362,8 +362,8 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
GRID_ASSERT(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
@@ -379,8 +379,8 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
GRID_ASSERT(AccCache.cpuLock>0);
GRID_ASSERT(AccCache.accLock==0);
AccCache.cpuLock--;
}
@@ -413,12 +413,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
// EvictVictims(bytes);
// }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
GRID_ASSERT((mode==CpuRead)||(mode==CpuWrite));
GRID_ASSERT(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
GRID_ASSERT(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
@@ -433,20 +433,20 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
GRID_ASSERT(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
@@ -528,12 +528,12 @@ void MemoryManager::Audit(std::string s)
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
GRID_ASSERT(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
GRID_ASSERT(AccCache.LRU_valid==1);
GRID_ASSERT(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
@@ -552,7 +552,7 @@ void MemoryManager::Audit(std::string s)
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
GRID_ASSERT(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
@@ -561,16 +561,16 @@ void MemoryManager::Audit(std::string s)
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
}
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
GRID_ASSERT( AccCache.cpuLock== 0 ) ;
GRID_ASSERT( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
GRID_ASSERT(LruBytes1==LruBytes2);
GRID_ASSERT(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
GRID_ASSERT(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
GRID_ASSERT(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
+3 -3
View File
@@ -10,16 +10,16 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
GRID_ASSERT(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
std::vector<uint64_t> pagedata(npages);
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
GRID_ASSERT(ret == offset);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
GRID_ASSERT(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
+4 -4
View File
@@ -165,7 +165,7 @@ public:
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
if ( d != dimension ) GRID_ASSERT ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
@@ -187,7 +187,7 @@ public:
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
inline const Coordinate &LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
@@ -216,11 +216,11 @@ public:
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
assert(gidx< gSites());
GRID_ASSERT(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
assert(lidx<lSites());
GRID_ASSERT(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
+2 -2
View File
@@ -128,10 +128,10 @@ public:
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
+9 -9
View File
@@ -67,7 +67,7 @@ public:
}
virtual int CheckerBoard(const Coordinate &site){
int linear=0;
assert(site.size()==_ndimension);
GRID_ASSERT(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
@@ -160,11 +160,11 @@ public:
_isCheckerBoarded = true;
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
GRID_ASSERT(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
GRID_ASSERT(checker_dim_mask.size() == _ndimension);
GRID_ASSERT(processor_grid.size() == _ndimension);
GRID_ASSERT(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
@@ -190,20 +190,20 @@ public:
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
GRID_ASSERT((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0);
GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
GRID_ASSERT(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
+13 -13
View File
@@ -108,7 +108,7 @@ public:
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BroadcastWorld(int root,void* data, uint64_t bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
@@ -149,7 +149,7 @@ public:
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
if (!list.empty()) // avoid triggering GRID_ASSERT in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
@@ -175,27 +175,27 @@ public:
int dest,
void *recv,
int from,
int bytes,int dir);
uint64_t bytes,int dir);
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
uint64_t bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int bytes,int dir);
uint64_t bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
uint64_t xbytes,uint64_t rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
@@ -206,7 +206,7 @@ public:
int xmit_to_rank,int do_xmit,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
uint64_t xbytes,uint64_t rbytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -220,20 +220,20 @@ public:
////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
void Broadcast(int root,void* data, uint64_t bytes);
////////////////////////////////////////////////////////////
// All2All down one dimension
////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
assert(in.size()==out.size());
GRID_ASSERT(dim>=0);
GRID_ASSERT(dim<_ndimension);
GRID_ASSERT(in.size()==out.size());
int numnode = _processors[dim];
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<31));
GRID_ASSERT(numnode * words == in.size());
GRID_ASSERT(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
+138 -80
View File
@@ -28,10 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
void GridAbort(void) { MPI_Abort(MPI_COMM_WORLD,SIGABRT); }
extern void * Grid_backtrace_buffer[_NBACKTRACE];
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
uint64_t checksum_index = 1;
#endif
////////////////////////////////////////////
// First initialise of comms system
@@ -56,11 +63,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
#endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0);
GRID_ASSERT(0);
}
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0);
GRID_ASSERT(0);
}
}
@@ -81,20 +88,20 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
GRID_ASSERT(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -121,8 +128,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
_ndimension = processors.size(); GRID_ASSERT(_ndimension>=1);
int parent_ndimension = parent._ndimension; GRID_ASSERT(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1);
@@ -146,7 +153,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
GRID_ASSERT (childsize * Nchild == Nparent);
Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent
@@ -172,12 +179,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
// Split the communicator
////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
GRID_ASSERT(ierr==0);
} else {
srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -202,7 +209,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
GRID_ASSERT(_processor_coor[d] == ccoor[d] );
}
}
@@ -244,7 +251,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
assert(Size==_Nprocessors);
GRID_ASSERT(Size==_Nprocessors);
}
CartesianCommunicator::~CartesianCommunicator()
@@ -272,62 +279,62 @@ void CartesianCommunicator::GlobalSum(double &d)
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
@@ -335,24 +342,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
int dest,
void *recv,
int from,
int bytes,int dir)
uint64_t bytes,int dir)
{
MPI_Request xrq;
MPI_Request rrq;
assert(dest != _processor);
assert(from != _processor);
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
int tag;
tag= dir+from*32;
int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
assert(ierr==0);
int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
GRID_ASSERT(ierr==0);
list.push_back(rrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
assert(ierr==0);
ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
GRID_ASSERT(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
@@ -363,7 +369,7 @@ void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
GRID_ASSERT(ierr==0);
list.resize(0);
}
@@ -372,7 +378,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
uint64_t bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
@@ -380,15 +386,15 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int ierr;
// Enforce no UVM in comms, device or host OK
assert(acceleratorIsCommunicable(xmit));
assert(acceleratorIsCommunicable(recv));
GRID_ASSERT(acceleratorIsCommunicable(xmit));
GRID_ASSERT(acceleratorIsCommunicable(recv));
// Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank,
recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
// Basic Halo comms primitive
@@ -396,7 +402,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dox,
void *recv,
int from, int dor,
int bytes,int dir)
uint64_t bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
@@ -419,7 +425,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
uint64_t xbytes,uint64_t rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
@@ -428,7 +434,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
uint64_t xbytes,uint64_t rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
@@ -441,9 +447,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
@@ -451,15 +457,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
GRID_ASSERT(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
GRID_ASSERT(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
@@ -469,14 +475,14 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
GRID_ASSERT(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
@@ -493,7 +499,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
GRID_ASSERT(ierr==0);
list.resize(0);
this->StencilBarrier();
}
@@ -536,7 +542,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
uint64_t xbytes,uint64_t rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
@@ -553,9 +559,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
@@ -568,18 +574,24 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
* - post device - host send buffer transfer asynch
*/
#ifdef GRID_CHECKSUM_COMMS
rbytes += 8;
xbytes += 8;
#endif
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
GRID_ASSERT(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
@@ -593,10 +605,19 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
#ifdef GRID_CHECKSUM_COMMS
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
GRID_ASSERT(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// GRID_ASSERT(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
@@ -635,7 +656,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
@@ -660,7 +685,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
uint64_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
@@ -671,8 +696,8 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
@@ -692,7 +717,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
uint64_t xbytes,uint64_t rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
@@ -705,9 +730,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
@@ -728,7 +753,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
GRID_ASSERT(shm!=NULL);
CommsRequest_t srq;
@@ -751,7 +776,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
GRID_ASSERT(shm!=NULL);
CommsRequest_t srq;
@@ -790,7 +815,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
// for(int r=0;r<nreq;r++){
@@ -798,7 +823,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit
fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
fflush(stderr);
// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
exit(1);
}
}
}
checksum_index += 1;
#endif
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
@@ -823,17 +881,17 @@ void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
(int)bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
int r;
@@ -843,23 +901,23 @@ int CartesianCommunicator::RankWorld(void){
void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
(int)bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
GRID_ASSERT(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
@@ -880,8 +938,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
GRID_ASSERT(words == iwords); // safe to cast to int ?
GRID_ASSERT(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
+17 -14
View File
@@ -27,6 +27,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
void GridAbort(void) { abort(); }
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -34,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char *** arv)
{
GlobalSharedMemory::Init(communicator_world);
@@ -54,14 +57,14 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_shm_processors = Coordinate(processors.size(),1);
_processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1);
_ndimension = processors.size(); GRID_ASSERT(_ndimension>=1);
_processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake
_Nprocessors=1;
_processor = 0;
for(int d=0;d<_ndimension;d++) {
assert(_processors[d]==1);
GRID_ASSERT(_processors[d]==1);
_processor_coor[d] = 0;
}
SetCommunicator(communicator_world);
@@ -87,19 +90,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
uint64_t bytes)
{
assert(0);
GRID_ASSERT(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ GRID_ASSERT(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
uint64_t bytes,int dir)
{
assert(0);
GRID_ASSERT(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
@@ -113,8 +116,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
@@ -130,7 +133,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int bytes, int dir)
uint64_t bytes, int dir)
{
return 2.0*bytes;
}
@@ -141,16 +144,16 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
uint64_t xbytes,uint64_t rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit, void *xmit_comp,
int xmit_to_rank,int dox,
void *recv,
void *recv, void *recv_comp,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
uint64_t xbytes,uint64_t rbytes, int dir)
{
return xbytes+rbytes;
}
+7 -7
View File
@@ -58,8 +58,8 @@ int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void)
{
assert(_ShmAlloc);
assert(_ShmAllocBytes>0);
GRID_ASSERT(_ShmAlloc);
GRID_ASSERT(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes);
}
@@ -80,7 +80,7 @@ void *SharedMemory::HostBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
GRID_ASSERT(host_heap_bytes<host_heap_size);
}
return ptr;
}
@@ -100,7 +100,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(heap_bytes<heap_size);
GRID_ASSERT(heap_bytes<heap_size);
}
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr;
@@ -127,13 +127,13 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
GRID_ASSERT(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
GRID_ASSERT(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
GRID_ASSERT(ShmSize == WorldShmSize);
return;
}
+40 -43
View File
@@ -43,10 +43,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
@@ -71,7 +67,7 @@ public:
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
sock = socket(AF_UNIX, SOCK_DGRAM, 0); GRID_ASSERT(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
@@ -162,7 +158,7 @@ public:
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
GRID_ASSERT(_ShmSetup==0);
WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize);
@@ -188,7 +184,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// WorldNodes
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
GRID_ASSERT( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
@@ -213,7 +209,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){
assert(g<WorldShmSize);
GRID_ASSERT(g<WorldShmSize);
MyGroup[g++] = rank;
}
}
@@ -229,7 +225,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
assert(ierr==0);
GRID_ASSERT(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
@@ -250,7 +246,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNode=g;
}
}
assert(WorldNode!=-1);
GRID_ASSERT(WorldNode!=-1);
_ShmSetup=1;
}
// Gray encode support
@@ -292,7 +288,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
GRID_ASSERT(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname
@@ -313,7 +309,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3);
GRID_ASSERT(nscan==3);
int nlo = N%9;
int nhi = N/9;
@@ -337,8 +333,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
//////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize);
assert(hypercoor>=0);
GRID_ASSERT(hypercoor<WorldSize);
GRID_ASSERT(hypercoor>=0);
//////////////////////////////////////
// Printing
@@ -386,7 +382,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
GRID_ASSERT(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
@@ -405,7 +401,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
@@ -435,7 +431,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
// std::cerr << " WorldSize "<<WorldSize << " Nprocessors "<<Nprocessors<<" "<<processors<<std::endl;
GRID_ASSERT(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
@@ -451,7 +448,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
@@ -460,8 +457,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
@@ -522,8 +519,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
@@ -632,7 +629,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
MPI_BYTE,
r,
WorldShmComm);
assert(ierr==0);
GRID_ASSERT(ierr==0);
}
///////////////////////////////////////////////////////////////
@@ -671,7 +668,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
}
assert(thisBuf!=nullptr);
GRID_ASSERT(thisBuf!=nullptr);
}
#endif
#ifdef GRID_CUDA
@@ -712,8 +709,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -743,9 +740,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
perror("failed mmap"); GRID_ASSERT(0);
}
assert(((uint64_t)ptr&0x3F)==0);
GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -760,8 +757,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -772,7 +769,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
GRID_ASSERT(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
@@ -786,9 +783,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
perror("failed mmap"); GRID_ASSERT(0);
}
assert(((uint64_t)ptr&0x3F)==0);
GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -807,8 +804,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
@@ -839,7 +836,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
perror("failed mmap");
assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
@@ -860,8 +857,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); GRID_ASSERT(0); }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
@@ -918,7 +915,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
assert (GlobalSharedMemory::ShmAlloc()==1);
GRID_ASSERT (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){
@@ -986,9 +983,9 @@ void SharedMemory::SharedMemoryTest(void)
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
GRID_ASSERT(check[0]==GlobalSharedMemory::WorldNode);
GRID_ASSERT(check[1]==r);
GRID_ASSERT(check[2]==magic);
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
@@ -1006,7 +1003,7 @@ void *SharedMemory::ShmBuffer(int rank)
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
GRID_ASSERT(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
+6 -6
View File
@@ -34,7 +34,7 @@ NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
GRID_ASSERT(_ShmSetup==0);
WorldComm = 0;
WorldRank = 0;
WorldSize = 1;
@@ -62,8 +62,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
@@ -92,8 +92,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
GRID_ASSERT(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0);
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
@@ -132,7 +132,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
assert(GlobalSharedMemory::ShmAlloc()==1);
GRID_ASSERT(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1);
ShmCommBufs.resize(1);
ShmRanks[0] = 0;
+5 -5
View File
@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWrite);
autoView( rhs_v, rhs, AcceleratorWriteDiscard);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
autoView( rhs_v , rhs, AcceleratorWrite);
autoView( rhs_v , rhs, AcceleratorWriteDiscard);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
@@ -240,9 +240,9 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout << "Scatter_plane merge GRID_ASSERT(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
assert(0); // This will fail if hit on GPU
GRID_ASSERT(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
autoView(lhs_v , lhs, AcceleratorWriteDiscard);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
+142 -14
View File
@@ -29,8 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid);
#ifdef GRID_CHECKSUM_COMMS
extern uint64_t checksum_index;
#endif
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
@@ -45,6 +49,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
if( shift ==0 ) {
ret = rhs;
return ret;
}
//
// Potential easy fast cases:
// Shift is a multiple of the local lattice extent.
// Then need only to shift whole subvolumes
int L = rhs.Grid()->_ldimensions[dimension];
if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
Cshift_simple(ret,rhs,dimension,shift);
return ret;
}
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type
@@ -69,6 +87,55 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
return ret;
}
template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
GridBase *grid=rhs.Grid();
int comm_proc, xmit_to_rank, recv_from_rank;
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int ld = rhs.Grid()->_ldimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
comm_proc = ((shift)/ld)%pd;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
if(comm_dim) {
int64_t bytes = sizeof(vobj) * grid->oSites();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(ret_v , ret, AcceleratorWrite);
void *send_buf = (void *)&rhs_v[0];
void *recv_buf = (void *)&ret_v[0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom(send_buf,
xmit_to_rank,
recv_buf,
recv_from_rank,
bytes);
#else
static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
static hostVector<vobj> hret; hret.resize(grid->oSites());
void *hsend_buf = (void *)&hrhs[0];
void *hrecv_buf = (void *)&hret[0];
acceleratorCopyFromDevice(send_buf,hsend_buf,bytes);
grid->SendToRecvFrom(hsend_buf,
xmit_to_rank,
hrecv_buf,
recv_from_rank,
bytes);
acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes);
#endif
}
}
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
@@ -117,17 +184,18 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
GRID_ASSERT(simd_layout==1);
GRID_ASSERT(comm_dim==1);
GRID_ASSERT(shift>=0);
GRID_ASSERT(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
#endif
int cb= (cbmask==0x2)? Odd : Even;
@@ -180,12 +248,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
GRID_ASSERT(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
GRID_ASSERT(expected_cs == computed_cs);
#else
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
@@ -227,10 +322,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
GRID_ASSERT(comm_dim==1);
GRID_ASSERT(simd_layout==2);
GRID_ASSERT(shift>=0);
GRID_ASSERT(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
@@ -256,8 +351,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
#endif
int bytes = buffer_size*sizeof(scalar_object);
@@ -301,7 +404,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
GRID_ASSERT (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
@@ -320,12 +423,37 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift_comms_simd: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
#endif
xbytes+=bytes;
+1 -1
View File
@@ -245,7 +245,7 @@ template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * =
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{
if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.Checkerboard());
GRID_ASSERT(cb == lat.Checkerboard());
}
cb = lat.Checkerboard();
}
+13 -13
View File
@@ -120,12 +120,12 @@ public:
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
GRID_ASSERT(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto exprCopy = expr;
@@ -144,12 +144,12 @@ public:
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
GRID_ASSERT(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto exprCopy = expr;
@@ -168,12 +168,12 @@ public:
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
GRID_ASSERT(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
@@ -191,11 +191,11 @@ public:
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
GRID_ASSERT(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
@@ -206,11 +206,11 @@ public:
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
GRID_ASSERT(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
@@ -221,11 +221,11 @@ public:
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
GRID_ASSERT(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
@@ -264,7 +264,7 @@ public:
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
GRID_ASSERT((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
SetViewMode(mode);
}
+5 -5
View File
@@ -166,9 +166,9 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
GRID_ASSERT(vlen>=1);
GRID_ASSERT(vlen<=sort_vals.size());
GRID_ASSERT(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
@@ -186,7 +186,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
GRID_ASSERT(idx[i] > i); GRID_ASSERT(j!=idx.size()); GRID_ASSERT(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
@@ -224,7 +224,7 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
GRID_ASSERT(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
+2 -2
View File
@@ -32,8 +32,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs.Grid() == rhs.Grid());
assert(lhs.Checkerboard() == rhs.Checkerboard());
GRID_ASSERT(lhs.Grid() == rhs.Grid());
GRID_ASSERT(lhs.Checkerboard() == rhs.Checkerboard());
}
NAMESPACE_END(Grid);
+3 -3
View File
@@ -42,7 +42,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -86,7 +86,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
assert( FullGrid->_simd_layout[Orthog]==1);
GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -140,7 +140,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
+9 -9
View File
@@ -98,8 +98,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
GRID_ASSERT( l.Checkerboard()== l.Grid()->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
@@ -135,7 +135,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
GRID_ASSERT( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -159,14 +159,14 @@ template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.getGrid();
assert(l.mode==CpuRead);
GRID_ASSERT(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
// GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
@@ -195,15 +195,15 @@ template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite);
GRID_ASSERT(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
// GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
+28 -59
View File
@@ -292,26 +292,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
// Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// uint64_t csum=0;
// uint64_t csum2=0;
// if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
// {
// Hack
// Fast integer xor checksum. Can also be used in comms now.
// autoView(l_v,left,AcceleratorRead);
// Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
// uint64_t *base= (uint64_t *)&l_v[0];
// csum=svm_xor(base,words);
// ok = FlightRecorder::CsumLog(csum);
// if ( !ok ) {
// csum2=svm_xor(base,words);
// std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
// } else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
// }
// GRID_ASSERT(ok);
// }
#endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right);
@@ -322,7 +322,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
GRID_ASSERT(ok);
}
FlightRecorder::StepLog("Start global sum");
grid->GlobalSumP2P(nrm);
@@ -376,40 +376,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(z_v[ss],tmp);
});
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
GRID_ASSERT(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
@@ -495,13 +464,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
GRID_ASSERT(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
GRID_ASSERT(orthogdim >= 0);
GRID_ASSERT(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
@@ -588,14 +557,14 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid();
assert(grid!=NULL);
GRID_ASSERT(grid!=NULL);
conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
GRID_ASSERT(orthogdim >= 0);
GRID_ASSERT(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
+173 -23
View File
@@ -197,55 +197,205 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
#undef GRID_REDUCTION_TIMING
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
GRID_ASSERT(ok);
Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
deviceVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
#ifdef GRID_REDUCTION_TIMING
RealD t_kernel = -usecond();
#endif
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
#ifdef GRID_REDUCTION_TIMING
t_kernel += usecond();
RealD t_d2h = -usecond();
#endif
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#ifdef GRID_REDUCTION_TIMING
t_d2h += usecond();
std::cout << GridLogDebug << " sumD_gpu_small"
<< " sizeof(sobj)=" << sizeof(sobj)
<< " blocks=" << numBlocks << " threads=" << numThreads
<< " kernel+barrier=" << t_kernel << " us"
<< " D2H=" << t_d2h << " us" << std::endl;
#endif
return result;
}
// Fused pack+reduce: reads R words of each vobj at word offset 'base',
// accumulates directly into iVector<iScalar<scalarD>,R> without staging
// through an intermediate bundle buffer. One HBM pass instead of three.
template <int R, class vobj, class sobj, class Iterator>
__device__ void packReduceBlocks(
const iScalar<typename vobj::vector_type> *idat,
sobj *g_odata, Iterator osites, int base, int words)
{
constexpr Iterator nsimd = vobj::Nsimd();
Iterator blockSize = blockDim.x;
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
sobj *sdata = (sobj *)shmem_pointer;
Iterator tid = threadIdx.x;
Iterator i = blockIdx.x * (blockSize * 2) + threadIdx.x;
Iterator gridSize = blockSize * 2 * gridDim.x;
sobj mySum = Zero();
while (i < osites * nsimd) {
Iterator lane = i % nsimd;
Iterator ss = i / nsimd;
sobj tmpD; zeroit(tmpD);
for (int k = 0; k < R; k++) {
auto w = extractLane(lane, idat[ss * words + base + k]);
iScalar<typename vobj::scalar_typeD> wd; wd = w;
tmpD._internal[k] = wd;
}
mySum += tmpD;
if (i + blockSize < osites * nsimd) {
lane = (i + blockSize) % nsimd;
ss = (i + blockSize) / nsimd;
sobj tmpD2; zeroit(tmpD2);
for (int k = 0; k < R; k++) {
auto w = extractLane(lane, idat[ss * words + base + k]);
iScalar<typename vobj::scalar_typeD> wd; wd = w;
tmpD2._internal[k] = wd;
}
mySum += tmpD2;
}
i += gridSize;
}
reduceBlock(sdata, mySum, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <int R, class vobj, class sobj, class Iterator>
__global__ void packReduceKernel(
const iScalar<typename vobj::vector_type> *idat,
sobj *buffer, Iterator osites, int base, int words)
{
Iterator blockSize = blockDim.x;
packReduceBlocks<R, vobj, sobj>(idat, buffer, osites, base, words);
if (gridDim.x > 1) {
const Iterator tid = threadIdx.x;
__shared__ bool amLast;
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
sobj *smem = (sobj *)shmem_pointer;
acceleratorFence();
if (tid == 0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
amLast = (ticket == gridDim.x - 1);
}
acceleratorSynchroniseAll();
if (amLast) {
Iterator i = tid;
sobj mySum = Zero();
while (i < (Iterator)gridDim.x) {
mySum += buffer[i];
i += blockSize;
}
reduceBlock(smem, mySum, tid);
if (tid == 0) {
buffer[0] = smem[0];
retirementCount = 0;
}
}
}
}
template<int R, class vobj>
inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
typename vobj::scalar_typeD *ret_p, int base)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
using BundleScalarD = iVector<iScalar<scalarD>, R>;
constexpr int Nsimd = vobj::Nsimd();
const int words = sizeof(vobj) / sizeof(vector);
const iScalar<vector> *idat = (const iScalar<vector> *)lat;
Integer size = (Integer)osites * Nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(BundleScalarD), numThreads, numBlocks);
GRID_ASSERT(ok);
Integer smemSize = numThreads * sizeof(BundleScalarD);
deviceVector<BundleScalarD> buffer(numBlocks);
BundleScalarD *buffer_v = &buffer[0];
BundleScalarD result;
#ifdef GRID_REDUCTION_TIMING
RealD t_kernel = -usecond();
#endif
packReduceKernel<R, vobj, BundleScalarD, Integer>
<<<numBlocks, numThreads, smemSize, computeStream>>>
(idat, buffer_v, osites, base, words);
accelerator_barrier();
#ifdef GRID_REDUCTION_TIMING
t_kernel += usecond();
RealD t_d2h = -usecond();
#endif
acceleratorCopyFromDevice(buffer_v, &result, sizeof(result));
#ifdef GRID_REDUCTION_TIMING
t_d2h += usecond();
std::cout << GridLogDebug << " sumD_gpu_reduce_words R=" << R
<< " base=" << base
<< " kernel=" << t_kernel << " D2H=" << t_d2h << " us" << std::endl;
#endif
for (int k = 0; k < R; k++)
ret_p[base + k] = TensorRemove(result._internal[k]);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
const int words = sizeof(vobj) / sizeof(vector);
sobjD ret; zeroit(ret);
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
deviceVector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
#ifdef GRID_REDUCTION_TIMING
RealD t_large = -usecond();
#endif
int w = 0;
while (w + 12 <= words) { sumD_gpu_reduce_words<12>(lat, osites, ret_p, w); w += 12; }
while (w + 4 <= words) { sumD_gpu_reduce_words< 4>(lat, osites, ret_p, w); w += 4; }
while (w < words) { sumD_gpu_reduce_words< 1>(lat, osites, ret_p, w); w += 1; }
#ifdef GRID_REDUCTION_TIMING
t_large += usecond();
std::cout << GridLogDebug << "sumD_gpu_large"
<< " sizeof(sobjD)=" << sizeof(sobjD)
<< " words=" << words << " total=" << t_large << " us" << std::endl;
#endif
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
+34 -16
View File
@@ -6,28 +6,27 @@ NAMESPACE_BEGIN(Grid);
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj identity; zeroit(identity);
sobj ret; zeroit(ret);
Integer nsimd= vobj::Nsimd();
{
sycl::buffer<sobj, 1> abuff(&ret, {1});
sobjD identity; zeroit(identity);
sobjD ret; zeroit(ret);
{
sycl::buffer<sobjD, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(sycl::range<1>{osites},
Reduction,
[=] (sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
auto Reduction = sycl::reduction(abuff, cgh, identity, std::plus<>());
cgh.parallel_for(sycl::range<1>{(size_t)osites},
Reduction,
[=](sycl::id<1> item, auto &sum) {
sobj s = Reduce(lat[item[0]]);
sobjD sd; sd = s;
sum += sd;
});
});
}
sobjD dret; convertType(dret,ret);
return dret;
return ret;
}
template <class vobj>
@@ -87,6 +86,25 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
auto l = index % 61;
sum ^= vec[index]<<l | vec[index]>>(64-l);
});
});
}
theGridAccelerator->wait();
return ret;
}
NAMESPACE_END(Grid);
+13 -13
View File
@@ -53,10 +53,10 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0);
GRID_ASSERT(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1);
GRID_ASSERT(fine->_simd_layout[d]==1);
GRID_ASSERT(fine->_processors[d]==1);
}
int multiplicity=1;
@@ -66,9 +66,9 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
GRID_ASSERT(coarse->_processors[d] == fine->_processors[fd]);
GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
GRID_ASSERT(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
@@ -83,18 +83,18 @@ inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
int lowerdims = fine->_ndimension - coarse->_ndimension; GRID_ASSERT(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
for(int d=0;d<lowerdims;d++) GRID_ASSERT(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) GRID_ASSERT(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd());
GRID_ASSERT(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
GRID_ASSERT( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites();
}
@@ -177,7 +177,7 @@ public:
skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow
GRID_ASSERT((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
@@ -218,7 +218,7 @@ public:
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
GRID_ASSERT(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
+9 -3
View File
@@ -1,7 +1,6 @@
#pragma once
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
@@ -57,8 +56,13 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
//copy offsets to device
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
#define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
#else
#define GRID_CUB_SUM_OP ::gpucub::Sum()
#endif
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
@@ -82,11 +86,13 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
#undef GRID_CUB_SUM_OP
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+64 -64
View File
@@ -31,15 +31,15 @@ NAMESPACE_BEGIN(Grid);
inline void subdivides(GridBase *coarse,GridBase *fine)
{
assert(coarse->_ndimension == fine->_ndimension);
GRID_ASSERT(coarse->_ndimension == fine->_ndimension);
int _ndimension = coarse->_ndimension;
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<_ndimension;d++){
assert(coarse->_processors[d] == fine->_processors[d]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
GRID_ASSERT(coarse->_processors[d] == fine->_processors[d]);
GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[d]);
GRID_ASSERT((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
}
@@ -309,7 +309,7 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GRID_ASSERT(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
@@ -344,7 +344,7 @@ template<class vobj,class vobj2,class CComplex>
GridBase * coarse= coarseA.Grid();
fineZ.Checkerboard()=fineX.Checkerboard();
assert(fineX.Checkerboard()==fineY.Checkerboard());
GRID_ASSERT(fineX.Checkerboard()==fineY.Checkerboard());
subdivides(coarse,fine); // require they map
conformable(fineX,fineY);
conformable(fineX,fineZ);
@@ -356,7 +356,7 @@ template<class vobj,class vobj2,class CComplex>
// FIXME merge with subdivide checking routine as this is redundant
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
GRID_ASSERT(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
autoView( fineZ_ , fineZ, AcceleratorWrite);
@@ -613,7 +613,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
int _ndimension = coarse->_ndimension;
// checks
assert( nbasis == Basis.size() );
GRID_ASSERT( nbasis == Basis.size() );
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i].Grid(),fine);
@@ -687,7 +687,7 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GRID_ASSERT(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
@@ -715,12 +715,12 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
int ni = ig->_ndimension;
int no = og->_ndimension;
assert(ni == no);
GRID_ASSERT(ni == no);
for(int d=0;d<no;d++){
assert(ig->_processors[d] == og->_processors[d]);
assert(ig->_ldimensions[d] == og->_ldimensions[d]);
assert(ig->lSites() == og->lSites());
GRID_ASSERT(ig->_processors[d] == og->_processors[d]);
GRID_ASSERT(ig->_ldimensions[d] == og->_ldimensions[d]);
GRID_ASSERT(ig->lSites() == og->lSites());
}
autoView(in_v,in,CpuRead);
@@ -752,16 +752,16 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
GRID_ASSERT(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
int nd = nF;
assert(nF == nT);
GRID_ASSERT(nF == nT);
for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]);
GRID_ASSERT(Fg->_processors[d] == Tg->_processors[d]);
}
///////////////////////////////////////////////////////////
@@ -821,12 +821,12 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
GRID_ASSERT(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
assert(nF+1 == nT);
GRID_ASSERT(nF+1 == nT);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
@@ -890,12 +890,12 @@ void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, in
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
GRID_ASSERT(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
assert(nT+1 == nF);
GRID_ASSERT(nT+1 == nF);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
@@ -955,16 +955,16 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl+1 == nh);
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
GRID_ASSERT(nl+1 == nh);
GRID_ASSERT(orthog<nh);
GRID_ASSERT(orthog>=0);
GRID_ASSERT(hg->_processors[orthog]==1);
int dl; dl = 0;
for(int d=0;d<nh;d++){
if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
GRID_ASSERT(lg->_processors[dl] == hg->_processors[d]);
GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++;
}
}
@@ -1005,17 +1005,17 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl+1 == nh);
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
GRID_ASSERT(nl+1 == nh);
GRID_ASSERT(orthog<nh);
GRID_ASSERT(orthog>=0);
GRID_ASSERT(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0;
for(int d=0;d<nh;d++){
if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
GRID_ASSERT(lg->_processors[dl] == hg->_processors[d]);
GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++;
}
}
@@ -1056,14 +1056,14 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl == nh);
assert(orthog<nh);
assert(orthog>=0);
GRID_ASSERT(nl == nh);
GRID_ASSERT(orthog<nh);
GRID_ASSERT(orthog>=0);
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
GRID_ASSERT(lg->_processors[d] == hg->_processors[d]);
GRID_ASSERT(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
Coordinate sz = lg->_ldimensions;
@@ -1093,7 +1093,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
subdivides(cg,fg);
assert(cg->_ndimension==fg->_ndimension);
GRID_ASSERT(cg->_ndimension==fg->_ndimension);
Coordinate ratio(cg->_ndimension);
@@ -1157,7 +1157,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
int lex;
Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
assert(lex < out.size());
GRID_ASSERT(lex < out.size());
out_ptrs[lane] = &out[lex];
}
@@ -1221,7 +1221,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype;
GridBase* grid = out.Grid();
assert(in.size()==grid->lSites());
GRID_ASSERT(in.size()==grid->lSites());
const int ndim = grid->Nd();
constexpr int nsimd = vtype::Nsimd();
@@ -1268,7 +1268,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype;
GridBase* grid = out._grid;
assert(in.size()==grid->lSites());
GRID_ASSERT(in.size()==grid->lSites());
int ndim = grid->Nd();
int nsimd = vtype::Nsimd();
@@ -1329,9 +1329,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
assert(out.Grid()->Nd() == in.Grid()->Nd());
GRID_ASSERT(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
GRID_ASSERT(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
}
out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid();
@@ -1382,9 +1382,9 @@ class precisionChangeWorkspace{
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
GRID_ASSERT(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
GRID_ASSERT(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
@@ -1549,7 +1549,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int full_vecs = full.size();
assert(full_vecs>=1);
GRID_ASSERT(full_vecs>=1);
GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split.Grid();
@@ -1567,18 +1567,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
//////////////////////////////
// Checks
//////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension);
GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){
assert(full[n].Checkerboard() == cb);
GRID_ASSERT(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){
assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
}
}
int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs);
GRID_ASSERT(nvector*split_nproc==full_nproc);
GRID_ASSERT(nvector == full_vecs);
Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){
@@ -1622,7 +1622,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int fvol = lsites;
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol);
// Loop over reordered data post A2A
thread_for(c, chunk, {
@@ -1675,7 +1675,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int full_vecs = full.size();
assert(full_vecs>=1);
GRID_ASSERT(full_vecs>=1);
GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split.Grid();
@@ -1693,18 +1693,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
//////////////////////////////
// Checks
//////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension);
GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){
assert(full[n].Checkerboard() == cb);
GRID_ASSERT(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){
assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
}
}
int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs);
GRID_ASSERT(nvector*split_nproc==full_nproc);
GRID_ASSERT(nvector == full_vecs);
Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){
@@ -1740,7 +1740,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
auto lsites= rsites/M; // Decreases rsites by M
int fvol = lsites;
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol);
{
// Loop over reordered data post A2A
+42
View File
@@ -106,6 +106,47 @@ public:
}
};
#ifdef GRID_LOG_VIEWS
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
const char* filename; int line, mode;
public:
ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
v(_v), filename(_filename), line(_line), mode(_mode) {
switch (mode){
case AcceleratorRead:
case AcceleratorWrite:
case CpuRead:
case CpuWrite:
ViewLogger::LogOpen(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
};
~ViewCloser() {
switch (mode) {
case AcceleratorWriteDiscard:
case AcceleratorWrite:
case CpuWrite:
ViewLogger::LogClose(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
v.ViewClose();
}
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
#else
// Little autoscope assister
template<class View>
class ViewCloser
@@ -119,6 +160,7 @@ class ViewCloser
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
#endif
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
+8 -8
View File
@@ -82,10 +82,10 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should be equal
assert(rNsimda==rNsimd);
GRID_ASSERT(rNsimda==rNsimd);
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
// GRID_ASSERT(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
@@ -172,7 +172,7 @@ template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
// GRID_ASSERT(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
@@ -247,7 +247,7 @@ public:
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) assert(local[d]>=depth);
if ( procs[d] > 1 ) GRID_ASSERT(local[d]>=depth);
}
}
void DeleteGrids(void)
@@ -448,9 +448,9 @@ public:
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
assert(depth<=lds[dimension]); // A must be on neighbouring node
assert(depth>0); // A caller bug if zero
assert(ld+2*depth==nld);
GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node
GRID_ASSERT(depth>0); // A caller bug if zero
GRID_ASSERT(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
@@ -460,7 +460,7 @@ public:
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
GRID_ASSERT( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static deviceVector<vobj> send_buf;
static deviceVector<vobj> recv_buf;
+3
View File
@@ -69,6 +69,7 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -84,6 +85,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogComms.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@@ -97,6 +99,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
+1 -6
View File
@@ -33,10 +33,6 @@
#ifndef GRID_LOG_H
#define GRID_LOG_H
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////
@@ -180,6 +176,7 @@ extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogComms;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;
@@ -226,8 +223,6 @@ inline void Grid_pass(Args&&... args) {
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() { \
char string[20]; \
+19 -17
View File
@@ -293,9 +293,9 @@ class BinaryIO {
// Flatten the file
uint64_t lsites = grid->lSites();
if ( control & BINARYIO_MASTER_APPEND ) {
assert(iodata.size()==1);
GRID_ASSERT(iodata.size()==1);
} else {
assert(lsites==iodata.size());
GRID_ASSERT(lsites==iodata.size());
}
for(int d=0;d<ndim;d++){
gStart[d] = lLattice[d]*pcoor[d];
@@ -326,20 +326,20 @@ class BinaryIO {
// Sobj in MPI phrasing
//////////////////////////////////////////////////////////////////////////////
int ierr;
ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); assert(ierr==0);
ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); GRID_ASSERT(ierr==0);
ierr = MPI_Type_commit(&mpiObject);
//////////////////////////////////////////////////////////////////////////////
// File global array data type
//////////////////////////////////////////////////////////////////////////////
ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); assert(ierr==0);
ierr=MPI_Type_commit(&fileArray); assert(ierr==0);
ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); GRID_ASSERT(ierr==0);
ierr=MPI_Type_commit(&fileArray); GRID_ASSERT(ierr==0);
//////////////////////////////////////////////////////////////////////////////
// local lattice array
//////////////////////////////////////////////////////////////////////////////
ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); assert(ierr==0);
ierr=MPI_Type_commit(&localArray); assert(ierr==0);
ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); GRID_ASSERT(ierr==0);
ierr=MPI_Type_commit(&localArray); GRID_ASSERT(ierr==0);
#endif
//////////////////////////////////////////////////////////////////////////////
@@ -349,8 +349,8 @@ class BinaryIO {
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
assert(ieee64||ieee32|ieee64big||ieee32big);
assert((ieee64+ieee32+ieee64big+ieee32big)==1);
GRID_ASSERT(ieee64||ieee32|ieee64big||ieee32big);
GRID_ASSERT((ieee64+ieee32+ieee64big+ieee32big)==1);
//////////////////////////////////////////////////////////////////////////////
// Do the I/O
//////////////////////////////////////////////////////////////////////////////
@@ -361,9 +361,9 @@ class BinaryIO {
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); GRID_ASSERT(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); GRID_ASSERT(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); GRID_ASSERT(ierr==0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
@@ -384,13 +384,14 @@ class BinaryIO {
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
GRID_ASSERT(fin.fail() == 0);
fin.close();
}
timer.Stop();
grid->Barrier();
timer.Stop();
bstimer.Start();
ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
@@ -435,11 +436,11 @@ class BinaryIO {
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
GRID_ASSERT(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
GRID_ASSERT(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
@@ -506,6 +507,7 @@ class BinaryIO {
offset = fout.tellp();
fout.close();
}
grid->Barrier();
timer.Stop();
}
+37 -37
View File
@@ -289,7 +289,7 @@ class GridLimeReader : public BinaryIO {
return;
}
}
assert(0);
GRID_ASSERT(0);
}
////////////////////////////////////////////
// Read a generic serialisable object
@@ -314,7 +314,7 @@ class GridLimeReader : public BinaryIO {
}
}
assert(0);
GRID_ASSERT(0);
}
template<class serialisable_object>
@@ -348,7 +348,7 @@ class GridLimeWriter : public BinaryIO
filename= _filename;
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
LimeW = limeCreateWriter(File); GRID_ASSERT(LimeW != NULL );
}
}
/////////////////////////////////////////////
@@ -368,7 +368,7 @@ class GridLimeWriter : public BinaryIO
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
GRID_ASSERT(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
@@ -386,11 +386,11 @@ class GridLimeWriter : public BinaryIO
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
GRID_ASSERT(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
err=limeWriteRecordHeader(h, LimeW); GRID_ASSERT(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); GRID_ASSERT(err>=0);
err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
limeDestroyHeader(h);
}
}
@@ -431,7 +431,7 @@ class GridLimeWriter : public BinaryIO
////////////////////////////////////////////////////////////////////
GridBase *grid = field.Grid();
assert(boss_node == field.Grid()->IsBoss() );
GRID_ASSERT(boss_node == field.Grid()->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
@@ -473,7 +473,7 @@ class GridLimeWriter : public BinaryIO
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
assert( (offset2-offset1) == PayloadSize);
GRID_ASSERT( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
@@ -481,7 +481,7 @@ class GridLimeWriter : public BinaryIO
/////////////////////////////////////////////////////////////
if ( boss_node ) {
err=limeWriterCloseRecord(LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagaing forward from the BinaryIO
@@ -621,8 +621,8 @@ class IldgWriter : public ScidacWriter {
uint64_t PayloadSize = LFN.size();
int err;
createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); GRID_ASSERT(err>=0);
err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
}
////////////////////////////////////////////////////////////////
@@ -656,7 +656,7 @@ class IldgWriter : public ScidacWriter {
header.sequence_number = sequence;
header.ildg_lfn = LFN;
assert ( (format == std::string("IEEE32BIG"))
GRID_ASSERT ( (format == std::string("IEEE32BIG"))
||(format == std::string("IEEE64BIG")) );
//////////////////////////////////////////////////////
@@ -676,8 +676,8 @@ class IldgWriter : public ScidacWriter {
ildgfmt.ly = header.dimension[1];
ildgfmt.lz = header.dimension[2];
ildgfmt.lt = header.dimension[3];
assert(header.nd==4);
assert(header.nd==header.dimension.size());
GRID_ASSERT(header.nd==4);
GRID_ASSERT(header.nd==header.dimension.size());
//////////////////////////////////////////////////////////////////////////////
// Field norm tests
@@ -734,7 +734,7 @@ class IldgReader : public GridLimeReader {
Coordinate dims = Umu.Grid()->FullDimensions();
assert(dims.size()==4);
GRID_ASSERT(dims.size()==4);
// Metadata holders
ildgFormat ildgFormat_ ;
@@ -793,10 +793,10 @@ class IldgReader : public GridLimeReader {
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
assert( ildgFormat_.lx == dims[0]);
assert( ildgFormat_.ly == dims[1]);
assert( ildgFormat_.lz == dims[2]);
assert( ildgFormat_.lt == dims[3]);
GRID_ASSERT( ildgFormat_.lx == dims[0]);
GRID_ASSERT( ildgFormat_.ly == dims[1]);
GRID_ASSERT( ildgFormat_.lz == dims[2]);
GRID_ASSERT( ildgFormat_.lt == dims[3]);
found_ildgFormat = 1;
}
@@ -813,10 +813,10 @@ class IldgReader : public GridLimeReader {
format = FieldMetaData_.floating_point;
assert(FieldMetaData_.dimension[0] == dims[0]);
assert(FieldMetaData_.dimension[1] == dims[1]);
assert(FieldMetaData_.dimension[2] == dims[2]);
assert(FieldMetaData_.dimension[3] == dims[3]);
GRID_ASSERT(FieldMetaData_.dimension[0] == dims[0]);
GRID_ASSERT(FieldMetaData_.dimension[1] == dims[1]);
GRID_ASSERT(FieldMetaData_.dimension[2] == dims[2]);
GRID_ASSERT(FieldMetaData_.dimension[3] == dims[3]);
found_FieldMetaData = 1;
}
@@ -866,13 +866,13 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format
//////////////////////////////////////////////////////
assert(found_ildgLFN);
assert(found_ildgBinary);
assert(found_ildgFormat);
assert(found_scidacChecksum);
GRID_ASSERT(found_ildgLFN);
GRID_ASSERT(found_ildgBinary);
GRID_ASSERT(found_ildgFormat);
GRID_ASSERT(found_scidacChecksum);
// Must find something with the lattice dimensions
assert(found_FieldMetaData||found_ildgFormat);
GRID_ASSERT(found_FieldMetaData||found_ildgFormat);
if ( found_FieldMetaData ) {
@@ -880,9 +880,9 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
GRID_ASSERT(found_ildgFormat);
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
GRID_ASSERT ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@@ -927,20 +927,20 @@ class IldgReader : public GridLimeReader {
FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
GRID_ASSERT( scidac_csuma ==FieldMetaData_.scidac_checksuma);
GRID_ASSERT( scidac_csumb ==FieldMetaData_.scidac_checksumb);
std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
} else {
std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
assert(0); // Can I insist always checksum ?
GRID_ASSERT(0); // Can I insist always checksum ?
}
if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker;
stats Stats;
Stats(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
GRID_ASSERT(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
GRID_ASSERT(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
}
}
+3 -3
View File
@@ -203,7 +203,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
assert( Nc < 4 && Nc > 1 ) ;
GRID_ASSERT( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
@@ -240,7 +240,7 @@ struct BinarySimpleUnmunger {
sobj_stype *in_buffer = (sobj_stype *)&in;
size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
GRID_ASSERT(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
@@ -259,7 +259,7 @@ struct BinarySimpleMunger {
sobj_stype *out_buffer = (sobj_stype *)&out;
size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
GRID_ASSERT(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
+16 -16
View File
@@ -76,7 +76,7 @@ public:
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
assert(line==std::string("BEGIN_HEADER"));
GRID_ASSERT(line==std::string("BEGIN_HEADER"));
do {
getline(fin,line); // read one line
@@ -106,9 +106,9 @@ public:
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
assert(grid->_ndimension == 4);
GRID_ASSERT(grid->_ndimension == 4);
for(int d=0;d<4;d++){
assert(grid->_fdimensions[d]==field.dimension[d]);
GRID_ASSERT(grid->_fdimensions[d]==field.dimension[d]);
}
field.link_trace = std::stod(header["LINK_TRACE"]);
@@ -183,7 +183,7 @@ public:
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
GRID_ASSERT(0);
}
GaugeStats Stats; Stats(Umu,clone);
@@ -205,9 +205,9 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
if(exitOnReadPlaquetteMismatch()) GRID_ASSERT(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
GRID_ASSERT(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
GRID_ASSERT(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
@@ -246,7 +246,7 @@ public:
GridBase *grid = Umu.Grid();
GridMetaData(grid,header);
assert(header.nd==4);
GRID_ASSERT(header.nd==4);
GaugeStats Stats; Stats(Umu,header);
MachineCharacteristics(header);
@@ -302,7 +302,7 @@ public:
GridBase *grid = parallel.Grid();
GridMetaData(grid,header);
assert(header.nd==4);
GRID_ASSERT(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
@@ -355,16 +355,16 @@ public:
std::string data_type(header.data_type);
#ifdef RNG_RANLUX
assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48"));
GRID_ASSERT(format == std::string("UINT64"));
GRID_ASSERT(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937"));
GRID_ASSERT(format == std::string("UINT32"));
GRID_ASSERT(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO"));
GRID_ASSERT(format == std::string("UINT64"));
GRID_ASSERT(data_type == std::string("SITMO"));
#endif
// depending on datatype, set up munger;
@@ -376,7 +376,7 @@ public:
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
assert(nersc_csum == header.checksum );
GRID_ASSERT(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}
+10 -10
View File
@@ -49,7 +49,7 @@ public:
{
std::ifstream fin(file, std::ios::in | std::ios::binary);
fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
assert(!fin.fail());
GRID_ASSERT(!fin.fail());
field.data_start = fin.tellg();
fin.close();
}
@@ -57,10 +57,10 @@ public:
header.plaq /= normalisationFactor;
// sanity check (should trigger on endian issues)
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
GRID_ASSERT(0 < header.Nt && header.Nt <= 1024);
GRID_ASSERT(0 < header.Nx && header.Nx <= 1024);
GRID_ASSERT(0 < header.Ny && header.Ny <= 1024);
GRID_ASSERT(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
@@ -71,9 +71,9 @@ public:
std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
assert(grid->_ndimension == Nd);
GRID_ASSERT(grid->_ndimension == Nd);
for(int d = 0; d < Nd; d++)
assert(grid->_fdimensions[d] == field.dimension[d]);
GRID_ASSERT(grid->_fdimensions[d] == field.dimension[d]);
field.plaquette = header.plaq;
@@ -86,10 +86,10 @@ public:
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
GRID_ASSERT(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
assert(grid != nullptr); assert(grid->_ndimension == Nd);
GRID_ASSERT(grid != nullptr); GRID_ASSERT(grid->_ndimension == Nd);
uint64_t offset = readHeader(file, Umu.Grid(), header);
@@ -171,7 +171,7 @@ public:
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
GRID_ASSERT(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
+22 -22
View File
@@ -62,7 +62,7 @@ public:
: swap(false)
, grid(gridPtr) {
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
assert(err == MPI_SUCCESS);
GRID_ASSERT(err == MPI_SUCCESS);
}
virtual ~ParRdr() { MPI_File_close(&fp); }
@@ -76,8 +76,8 @@ public:
}
int readHeader(FieldMetaData& field) {
assert((grid->_ndimension == Nd) && (Nd == 4));
assert(Nc == 3);
GRID_ASSERT((grid->_ndimension == Nd) && (Nd == 4));
GRID_ASSERT(Nc == 3);
OpenQcdHeader header;
@@ -86,10 +86,10 @@ public:
header.plaq /= 3.; // TODO change this into normalizationfactor
// sanity check (should trigger on endian issues) TODO remove?
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
GRID_ASSERT(0 < header.Nt && header.Nt <= 1024);
GRID_ASSERT(0 < header.Nx && header.Nx <= 1024);
GRID_ASSERT(0 < header.Ny && header.Ny <= 1024);
GRID_ASSERT(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
@@ -97,7 +97,7 @@ public:
field.dimension[3] = header.Nt;
for(int d = 0; d < Nd; d++)
assert(grid->FullDimensions()[d] == field.dimension[d]);
GRID_ASSERT(grid->FullDimensions()[d] == field.dimension[d]);
field.plaquette = header.plaq;
@@ -114,15 +114,15 @@ public:
int read = -1;
MPI_Get_count(&status, datatype, &read);
// CHECK_VAR(read)
assert(nbytes == (uint64_t)read);
assert(err == MPI_SUCCESS);
GRID_ASSERT(nbytes == (uint64_t)read);
GRID_ASSERT(err == MPI_SUCCESS);
}
void createTypes() {
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
err = MPI_Type_commit(&oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
Coordinate const L = grid->GlobalDimensions();
Coordinate const l = grid->LocalDimensions();
@@ -132,20 +132,20 @@ public:
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
err = MPI_Type_commit(&fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
}
void freeTypes() {
err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_free(&fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
err = MPI_Type_free(&oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
}
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
auto hdr_offset = readHeader(meta);
CHECK
createTypes();
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); GRID_ASSERT(err == MPI_SUCCESS);
CHECK
int const domainSites = grid->lSites();
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
@@ -166,7 +166,7 @@ public:
CHECK
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
errInfo(err, "MPI_File_set_view1");
assert(err == MPI_SUCCESS);
GRID_ASSERT(err == MPI_SUCCESS);
freeTypes();
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
@@ -182,7 +182,7 @@ public:
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
GRID_ASSERT(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = Umu.Grid();
@@ -225,7 +225,7 @@ public:
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
GRID_ASSERT(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
@@ -246,7 +246,7 @@ private:
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
std::vector<ColourMatrixD> const& node_buff,
GridBase* grid) {
assert(node_buff.size() == Nd * grid->lSites());
GRID_ASSERT(node_buff.size() == Nd * grid->lSites());
Coordinate const& l = grid->LocalDimensions();
@@ -274,7 +274,7 @@ private:
buff_idx += 2 * Nd;
}
assert(node_buff.size() == buff_idx);
GRID_ASSERT(node_buff.size() == buff_idx);
}
};
+5 -5
View File
@@ -51,8 +51,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#endif
#ifdef __x86_64__
#ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
//accelerator_inline uint64_t __rdtsc(void) { return 0; }
//accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
#include <x86intrin.h>
#endif
@@ -146,8 +146,8 @@ public:
PerformanceCounter(int _pct) {
#ifdef __linux__
assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
GRID_ASSERT(_pct>=0);
GRID_ASSERT(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1;
cyclefd=-1;
count=0;
@@ -213,7 +213,7 @@ public:
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
ign=::read(fd, &count, sizeof(long long));
ign+=::read(cyclefd, &cycles, sizeof(long long));
assert(ign==2*sizeof(long long));
GRID_ASSERT(ign==2*sizeof(long long));
}
elapsed = cyclecount() - begin;
#else
+1 -1
View File
@@ -150,7 +150,7 @@ void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
}
int type;
int ret = fscanf(fp, "%d", &type);
assert(ret == 1);
GRID_ASSERT(ret == 1);
fclose(fp);
// std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
+8 -4
View File
@@ -60,12 +60,16 @@ inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
}
inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now)
{
double secs = 1.0*now.count()*1.0e-3;
stream << secs<<" s";
/*
GridSecs second(1);
auto secs = now/second ;
auto subseconds = now%second ;
auto fill = stream.fill();
stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
stream.fill(fill);
*/
return stream;
}
inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
@@ -90,14 +94,14 @@ public:
Reset();
}
void Start(void) {
assert(running == false);
GRID_ASSERT(running == false);
#ifdef TIMERS_ON
start = GridClock::now();
#endif
running = true;
}
void Stop(void) {
assert(running == true);
GRID_ASSERT(running == true);
#ifdef TIMERS_ON
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
@@ -111,11 +115,11 @@ public:
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
}
GridTime Elapsed(void) const {
assert(running == false);
GRID_ASSERT(running == false);
return std::chrono::duration_cast<GridTime>( accumulator );
}
uint64_t useconds(void) const {
assert(running == false);
GRID_ASSERT(running == false);
return (uint64_t) accumulator.count();
}
bool isRunning(void) const {
+18
View File
@@ -596,16 +596,32 @@ template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
//////////////////////////////////////////
// Trace lattice and non-lattice
//////////////////////////////////////////
#define GRID_UNOP(name) name
#define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
{ \
return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
}
template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
{
return traceIndex<SpinIndex>(lhs);
}
GridUnopClass(UnaryTraceSpin, traceIndex<SpinIndex>(a));
GRID_DEF_UNOP(traceSpin, UnaryTraceSpin);
template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
{
return traceIndex<ColourIndex>(lhs);
}
GridUnopClass(UnaryTraceColour, traceIndex<ColourIndex>(a));
GRID_DEF_UNOP(traceColour, UnaryTraceColour);
template<int Index,class vobj>
inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
{
@@ -617,6 +633,8 @@ inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIn
return traceIndex<ColourIndex>(lhs);
}
#undef GRID_UNOP
#undef GRID_DEF_UNOP
//////////////////////////////////////////
// Current types
//////////////////////////////////////////
+2 -2
View File
@@ -136,9 +136,9 @@ class EmptyAction : public Action <GaugeField>
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { GRID_ASSERT(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { GRID_ASSERT(0); }; // evaluate the action derivative
///////////////////////////////
// Logging
+1 -1
View File
@@ -77,7 +77,7 @@ public:
actions(std::get<0>(actions_hirep)), multiplier(mul) {
// initialize the hirep vectors to zero.
// apply(this->resize, actions_hirep, 0); //need a working resize
assert(mul >= 1);
GRID_ASSERT(mul >= 1);
}
template < class GenField >
+1 -1
View File
@@ -126,7 +126,7 @@ public:
// possible boost
std::vector<ComplexD> qmu;
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; GRID_ASSERT(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
// Cayley form Moebius (tanh and zolotarev)
+2 -2
View File
@@ -181,7 +181,7 @@ public:
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
GRID_ASSERT(0);
return lambda;
}
@@ -324,7 +324,7 @@ public:
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
GRID_ASSERT(0);
return lambda;
}
@@ -210,8 +210,8 @@ private:
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
const MaskField* m = getCorrectMaskField(f); GRID_ASSERT(m != nullptr);
GRID_ASSERT(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
@@ -164,8 +164,8 @@ private:
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
const MaskField* m = getCorrectMaskField(f); GRID_ASSERT(m != nullptr);
GRID_ASSERT(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
@@ -74,8 +74,8 @@ public:
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
GRID_ASSERT(twist.size() == Nd);//check that twist is Nd
GRID_ASSERT(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
+10 -10
View File
@@ -110,9 +110,9 @@ public:
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { assert(0);};
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { GRID_ASSERT(0);};
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ GRID_ASSERT(0);};
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ GRID_ASSERT(0);};
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
@@ -128,7 +128,7 @@ public:
void DhopOE(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Even);
GRID_ASSERT(in.Checkerboard()==Even);
Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag ); out = out + tmp;
@@ -137,7 +137,7 @@ public:
void DhopEO(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Odd);
GRID_ASSERT(in.Checkerboard()==Odd);
Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag ); out = out + tmp;
@@ -147,11 +147,11 @@ public:
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
void MdirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
void DhopDirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ GRID_ASSERT(0);};
void MdirAll(const FermionField &in, std::vector<FermionField> &out) { GRID_ASSERT(0);};
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { GRID_ASSERT(0);};
void DhopDirAll(const FermionField &in, std::vector<FermionField> &out) { GRID_ASSERT(0);};
void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { GRID_ASSERT(0);};
void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
{
+1 -1
View File
@@ -123,7 +123,7 @@ public:
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
assert(zdata->n==this->Ls);
GRID_ASSERT(zdata->n==this->Ls);
// std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
// Call base setter

Some files were not shown because too many files have changed in this diff Show More