From ad2b699d2b495250da3b596603be0b62018e45d1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Sep 2023 16:12:21 -0400 Subject: [PATCH 01/50] Better macos --- systems/mac-arm/config-command-mpi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi index 84a656f4..674f9043 100644 --- a/systems/mac-arm/config-command-mpi +++ b/systems/mac-arm/config-command-mpi @@ -1,4 +1,4 @@ BREW=/opt/local/ -MPICXX=mpicxx CXX=c++-12 ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug +CXX=mpicxx-openmpi-mp ../../configure --enable-simd=GEN --enable-comms=mpi --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug From e29b97b3eac9b16bcb1e6f77ebd85c61e967b11e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Sep 2023 16:14:03 -0400 Subject: [PATCH 02/50] Qslash term added --- Grid/qcd/action/fermion/CayleyFermion5D.h | 5 +++ .../CayleyFermion5DImplementation.h | 40 ++++++++++++++++++- tests/solver/Test_dwf_cg_prec.cc | 8 +++- tests/solver/Test_dwf_cg_unprec.cc | 26 +++++++++++- 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index cf39ec99..fb20a958 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -124,6 +124,11 @@ public: RealD _b; RealD _c; + // possible boost + std::vector qmu; + void set_qmu(std::vector _qmu) { qmu=_qmu; assert(qmu.size()==Nd);}; + void addQmu(const FermionField &in, FermionField &out, int dag); + // Cayley form Moebius (tanh and zolotarev) Vector omega; Vector bs; // S dependent coeffs diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index 2b8a3a18..befdf0ed 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -48,7 +48,8 @@ CayleyFermion5D::CayleyFermion5D(GaugeField &_Umu, FourDimGrid, FourDimRedBlackGrid,_M5,p), mass_plus(_mass), mass_minus(_mass) -{ +{ + // qmu defaults to zero size; } /////////////////////////////////////////////////////////////// @@ -270,6 +271,34 @@ void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField M5Ddag(psi,psi,Din,lower,diag,upper); } +template +void CayleyFermion5D::addQmu(const FermionField &psi,FermionField &chi, int dag) +{ + if ( qmu.size() ) { + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + std::vector coeff(Nd); + ComplexD ci(0,1); + + assert(qmu.size()==Nd); + + for(int mu=0;mu void CayleyFermion5D::M (const FermionField &psi, FermionField &chi) { @@ -277,8 +306,12 @@ void CayleyFermion5D::M (const FermionField &psi, FermionField &chi) // Assemble Din Meooe5D(psi,Din); - + this->DW(Din,chi,DaggerNo); + + // add i q_mu gamma_mu here + addQmu(Din,chi,DaggerNo); + // ((b D_W + D_w hop terms +1) on s-diag axpby(chi,1.0,1.0,chi,psi); @@ -295,6 +328,9 @@ void CayleyFermion5D::Mdag (const FermionField &psi, FermionField &chi) FermionField Din(psi.Grid()); // Apply Dw 
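   // Since gamma_mu is Hermitian, (i q_mu gamma_mu)^dag = -i conj(q_mu) gamma_mu, so the
   // dagger path must add the conjugated, sign-flipped coefficients via addQmu(...,DaggerYes);
   // the boosted operator therefore remains gamma_5-Hermitian only when q_mu is real.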
this->DW(psi,Din,DaggerYes); + + // add -i conj(q_mu) gamma_mu here ... if qmu is real, gamma_5 hermitian, otherwise not. + addQmu(psi,Din,DaggerYes); MeooeDag5D(Din,chi); diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index f4e346bf..af3f4cf0 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -1,4 +1,4 @@ -/************************************************************************************* +************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -67,7 +67,13 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); +#if 0 + FieldMetaData header; + std::string file("ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); +#else SU<Nc>::HotConfiguration(RNG4, Umu); +#endif std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/solver/Test_dwf_cg_unprec.cc b/tests/solver/Test_dwf_cg_unprec.cc index 58614c49..7435bfae 100644 --- a/tests/solver/Test_dwf_cg_unprec.cc +++ b/tests/solver/Test_dwf_cg_unprec.cc @@ -54,15 +54,30 @@ int main (int argc, char ** argv) GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + std::vector<ComplexD> qmu; + qmu.push_back(ComplexD(0.1,0.0)); + qmu.push_back(ComplexD(0.0,0.0)); + qmu.push_back(ComplexD(0.0,0.0)); + qmu.push_back(ComplexD(0.0,0.01)); + + std::vector<int> seeds4({1,2,3,4}); std::vector<int> seeds5({5,6,7,8}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + LatticeFermion tmp(FGrid); LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); - + LatticeGaugeField Umu(UGrid); +#if 0 + FieldMetaData header; + std::string file("ckpoint_lat.4000"); + NerscIO::readConfiguration(Umu,header,file); +#else + SU<Nc>::HotConfiguration(RNG4,Umu); +#endif + std::vector<LatticeColourMatrix> U(4,UGrid); for(int mu=0;mu<Nd;mu++){ U[mu] = PeekIndex<LorentzIndex>(Umu,mu); @@ -71,8 +86,15 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5=1.8; DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + Ddwf.qmu = qmu; + Ddwf.M(src,tmp); + std::cout << " |M src|^2 "<<norm2(tmp)<<std::endl; + MdagMLinearOperator<DomainWallFermionD,LatticeFermion> HermOp(Ddwf); + HermOp.HermOp(src,tmp); + + std::cout << " "<<norm2(tmp)<<std::endl; ConjugateGradient<LatticeFermion> CG(1.0e-6,10000); CG(HermOp,src,result); From 622f78ebea56fa355afeb69db83ab2cfa2cfa2bc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 4 Sep 2024 13:53:48 +0000 Subject: [PATCH 03/50] SYCL updates -- operator = giving trouble on Aurora. SYCL reduction is failing intermittently with SVM interface - returns zero, expect non-zero. Think I need to remove ALL dependence on SVM. 
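For reference, the buffer-based sycl::reduction pattern this patch adopts can be exercised standalone. The following is a minimal sketch against plain SYCL 2020, not Grid code; the queue setup and the data/rbuf names are illustrative assumptions:

#include <sycl/sycl.hpp>
#include <vector>
#include <cstdio>

int main() {
  const size_t N = 1024;
  sycl::queue q;

  // Device-resident input, staged explicitly from the host (no SVM/USM-shared state).
  std::vector<double> host(N, 1.0);
  double *data = sycl::malloc_device<double>(N, q);
  q.memcpy(data, host.data(), N * sizeof(double)).wait();

  double result = 0.0;
  { // The scalar comes back through a one-element buffer, not through SVM.
    sycl::buffer<double> rbuf(&result, sycl::range<1>(1));
    q.submit([&](sycl::handler &cgh) {
      auto red = sycl::reduction(rbuf, cgh, sycl::plus<double>());
      cgh.parallel_for(sycl::range<1>(N), red,
                       [=](sycl::id<1> i, auto &sum) { sum += data[i]; });
    });
  } // Buffer destruction blocks and writes the reduced value back into 'result'.

  std::printf("sum = %g (expect %g)\n", result, double(N));
  sycl::free(data, q);
  return 0;
}

Returning the scalar through the buffer makes the host write-back an explicit, synchronising event, which is exactly the guarantee the SVM path was intermittently failing to provide.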
--- Grid/lattice/Lattice_base.h | 7 ++-- Grid/lattice/Lattice_reduction_sycl.h | 46 ++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 9d4d3d5f..515c847f 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -236,10 +236,13 @@ public: template inline Lattice & operator = (const sobj & r){ vobj vtmp; vtmp = r; -#if defined(GRID_HIP) || defined(GRID_CUDA) || defined (GRID_SYCL) +#if 0 + deviceVector vvtmp(1); + acceleratorPut(vvtmp[0],vtmp); + vobj *vvtmp_p = & vvtmp[0]; auto me = View(AcceleratorWrite); accelerator_for(ss,me.size(),vobj::Nsimd(),{ - auto stmp=coalescedRead(vtmp); + auto stmp=coalescedRead(*vvtmp_p); coalescedWrite(me[ss],stmp); }); #else diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index b8dc5378..7dff7939 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -4,16 +4,36 @@ NAMESPACE_BEGIN(Grid); // Possibly promote to double and sum ///////////////////////////////////////////////////////////////////////////////////////////////////////// + template inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; +#if 1 + sobj identity; zeroit(identity); + sobj ret; zeroit(ret); + Integer nsimd= vobj::Nsimd(); + { + sycl::buffer abuff(&ret, {1}); + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{osites}, + Reduction, + [=] (cl::sycl::id<1> item, auto &sum) { + auto osite = item[0]; + sum +=Reduce(lat[osite]); + }); + }); + } + sobjD dret; convertType(dret,ret); + return dret; +#else static Vector mysum; mysum.resize(1); sobj *mysum_p = & mysum[0]; sobj identity; zeroit(identity); - mysum[0] = identity; + acceleratorPut(mysum[0],identity); sobj ret ; Integer nsimd= vobj::Nsimd(); @@ -33,6 +53,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os // free(mysum,*theGridAccelerator); sobjD dret; convertType(dret,ret); return dret; +#endif } template @@ -76,12 +97,28 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite template Word svm_xor(Word *vec,uint64_t L) { - Word xorResult; xorResult = 0; +#if 1 + Word identity; identity=0; + Word ret = 0; + { + sycl::buffer abuff(&ret, {1}); + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); + cgh.parallel_for(cl::sycl::range<1>{L}, + Reduction, + [=] (cl::sycl::id<1> index, auto &sum) { + sum ^=vec[index]; + }); + }); + } + theGridAccelerator->wait(); + return ret; +#else static Vector d_sum; d_sum.resize(1); Word *d_sum_p=&d_sum[0]; Word identity; identity=0; - d_sum[0] = identity; + acceleratorPut(d_sum[0],identity); const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); theGridAccelerator->submit([&](cl::sycl::handler &cgh) { auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); @@ -92,9 +129,10 @@ template Word svm_xor(Word *vec,uint64_t L) }); }); theGridAccelerator->wait(); - Word ret = d_sum[0]; + Word ret = acceleratorGet(d_sum[0]); // free(d_sum,*theGridAccelerator); return ret; +#endif } NAMESPACE_END(Grid); From 160969a7588884c27ae9bff9d74453b12c123ce2 Mon Sep 17 
00:00:00 2001 From: Peter Boyle Date: Tue, 10 Sep 2024 18:09:42 +0000 Subject: [PATCH 04/50] UVM tester, doesn't turn up anything --- tests/core/Test_uvm.cc | 106 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 tests/core/Test_uvm.cc diff --git a/tests/core/Test_uvm.cc b/tests/core/Test_uvm.cc new file mode 100644 index 00000000..290aa975 --- /dev/null +++ b/tests/core/Test_uvm.cc @@ -0,0 +1,106 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_memory_manager.cc + + Copyright (C) 2022 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +const int64_t Pages=32; +const int64_t PageWords=4096/sizeof(ComplexD); +const int64_t VecWords=PageWords*Pages; +const int64_t N=10000; + +class Tester { +public: + Vector zero_uvm; + std::vector zero_host; + std::vector > A; + std::vector > B; + uint64_t counter; + + Tester() : + zero_uvm(VecWords,ComplexD(0.0)), + zero_host(VecWords,ComplexD(0.0)), + A(N,zero_uvm), + B(N,zero_host) + { counter = 0; } + + void MemoryTest(int N) + { + for(int epoch = 0;epoch<100000;epoch++){ + + int p = random() %Pages; // Which address/page to hit + int v = random() %N; // Which vec + int w = random() %2; // Write or read + int dev= random() %2; // On device? 
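+      // Each epoch touches one random page of one random vector, either writing or reading.
+      // B[v] is a plain host mirror of the UVM vector A[v]: a write updates both copies (A
+      // either on the host or from inside a device kernel), while a read compares A against
+      // B, so a lost or stale UVM page migration shows up as a host/device mismatch.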
+ // int e=1; + ComplexD zc = counter++; + + if ( w ) { + B[v][p*PageWords] = B[v][p*PageWords] + zc; + if ( dev ) { + ComplexD *A_v=&A[v][0]; + accelerator_for(ss,1,1,{ + A_v[p*PageWords] = A_v[p*PageWords] + zc; + }); + } else { + A[v][p*PageWords] = A[v][p*PageWords] + zc; + } + } else { + if ( dev ) { + ComplexD *A_v=&A[v][0]; + ComplexD ref = B[v][p*PageWords]; + std::cout << "Device compare "< Date: Tue, 10 Sep 2024 18:11:52 +0000 Subject: [PATCH 05/50] Aurora testing --- systems/Aurora/tests/repro16.pbs | 12 ++---- systems/Aurora/tests/repro1gpu.pbs | 10 +++-- systems/Aurora/tests/reproBigJob.pbs | 63 ++++++++++++++++++++++++++++ systems/Aurora/tests/reproN.pbs | 7 ++-- 4 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 systems/Aurora/tests/reproBigJob.pbs diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs index fa37ae09..5d5314c1 100644 --- a/systems/Aurora/tests/repro16.pbs +++ b/systems/Aurora/tests/repro16.pbs @@ -2,7 +2,8 @@ ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 -#PBS -l select=16:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=01:00:00 #PBS -N dwf @@ -13,19 +14,14 @@ cd $PBS_O_WORKDIR -#source ../sourceme.sh +source ../sourceme.sh cat $PBS_NODEFILE -#export MPICH_COLL_SYNC=1 -#export MPICH_ENV_DISPLAY=1 -export MPICH_ export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu -#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH +#module load mpich/51.2/icc-all-deterministic-pmix-gpu #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST diff --git a/systems/Aurora/tests/repro1gpu.pbs b/systems/Aurora/tests/repro1gpu.pbs index 283a9343..f8e52705 100644 --- a/systems/Aurora/tests/repro1gpu.pbs +++ b/systems/Aurora/tests/repro1gpu.pbs @@ -1,6 +1,7 @@ #!/bin/bash -#PBS -l select=16:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 #PBS -N repro1gpu @@ -9,8 +10,9 @@ #export OMP_PROC_BIND=spread #unset OMP_PLACES -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu + +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 @@ -34,6 +36,8 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" cd $PBS_O_WORKDIR +source ../sourceme.sh + NN=`cat $PBS_NODEFILE | wc -l` echo $PBS_NODEFILE cat $PBS_NODEFILE diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs new file mode 100644 index 00000000..205fefce --- /dev/null +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -0,0 +1,63 @@ +#!/bin/bash + +#PBS -l select=16 +#PBS -q EarlyAppAccess +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=02:00:00 +#PBS -N reproBigJob +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +#export 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +DIR=reproBigJob.$PBS_JOBID + +mkdir -p $DIR +cd $DIR + +cp $PBS_NODEFILE nodefile + +CMD="mpiexec -np 192 -ppn 12 -envall --hostfile nodefile \ + ../gpu_tile_compact.sh \ + ../Test_dwf_mixedcg_prec --mpi 4.4.4.3 --grid 128.128.128.96 \ + --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + +echo $CMD > command-line +env > environment +$CMD +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* diff --git a/systems/Aurora/tests/reproN.pbs b/systems/Aurora/tests/reproN.pbs index 293e7ade..be10558b 100644 --- a/systems/Aurora/tests/reproN.pbs +++ b/systems/Aurora/tests/reproN.pbs @@ -1,6 +1,7 @@ #!/bin/bash -#PBS -l select=32:system=sunspot,place=scatter +#PBS -l select=16 +#PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 #PBS -N reproN @@ -9,8 +10,8 @@ #export OMP_PROC_BIND=spread #unset OMP_PLACES -module load oneapi/eng-compiler/2023.05.15.003 -module load mpich/51.2/icc-all-deterministic-pmix-gpu +#module load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 From 066544281feef3f16baf6c88ec786865182f0caa Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 17 Sep 2024 13:34:27 +0000 Subject: [PATCH 06/50] Deprecate UVM --- Grid/algorithms/FFT.h | 9 +- .../iterative/ConjugateGradientMultiShift.h | 10 +- .../ConjugateGradientMultiShiftCleanup.h | 10 +- .../ConjugateGradientMultiShiftMixedPrec.h | 10 +- Grid/algorithms/multigrid/CoarsenedMatrix.h | 67 +++-- Grid/allocator/AlignedAllocator.h | 34 +-- Grid/allocator/MemoryStats.cc | 4 +- Grid/cshift/Cshift.h | 1 - Grid/cshift/Cshift_common.h | 81 +----- Grid/cshift/Cshift_mpi.h | 274 ++---------------- Grid/cshift/Cshift_table.cc | 2 +- Grid/lattice/Lattice_basis.h | 61 ++-- Grid/lattice/Lattice_reduction.h | 35 +-- Grid/lattice/Lattice_reduction_gpu.h | 14 +- Grid/lattice/Lattice_reduction_sycl.h | 82 +----- Grid/lattice/Lattice_slicesum_core.h | 81 ++++-- Grid/lattice/PaddedCell.h | 8 +- Grid/qcd/action/fermion/CayleyFermion5D.h | 54 ++-- .../fermion/ContinuedFractionFermion5D.h | 12 +- .../action/fermion/DomainWallEOFAFermion.h | 6 +- .../action/fermion/ImprovedStaggeredFermion.h | 8 +- .../fermion/ImprovedStaggeredFermion5D.h | 5 - Grid/qcd/action/fermion/MobiusEOFAFermion.h | 22 +- .../action/fermion/NaiveStaggeredFermion.h | 9 +- .../action/fermion/PartialFractionFermion5D.h | 4 +- 
Grid/qcd/action/fermion/SchurDiagTwoKappa.h | 2 +- Grid/qcd/action/fermion/StaggeredKernels.h | 4 +- Grid/qcd/action/fermion/WilsonCompressor.h | 9 +- Grid/qcd/action/fermion/WilsonFermion.h | 16 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 6 - Grid/qcd/action/fermion/ZMobiusFermion.h | 2 +- .../CayleyFermion5Dvec.h | 4 + .../action/fermion/deprecated}/Lebesgue.cc | 2 + .../action/fermion/deprecated}/Lebesgue.h | 2 +- .../CayleyFermion5DImplementation.h | 48 +-- .../implementation/CayleyFermion5Dcache.h | 68 +++-- .../DomainWallEOFAFermionCache.h | 44 ++- .../DomainWallEOFAFermionImplementation.h | 26 +- ...ImprovedStaggeredFermion5DImplementation.h | 24 +- .../ImprovedStaggeredFermionImplementation.h | 24 +- .../implementation/MobiusEOFAFermionCache.h | 144 ++++++--- .../MobiusEOFAFermionImplementation.h | 32 +- .../NaiveStaggeredFermionImplementation.h | 24 +- .../implementation/StaggeredKernelsHand.h | 17 -- .../StaggeredKernelsImplementation.h | 4 +- .../WilsonFermion5DImplementation.h | 33 ++- .../WilsonFermionImplementation.h | 27 +- .../implementation/WilsonKernelsAsmAvx512.h | 8 +- .../WilsonKernelsImplementation.h | 2 +- Grid/qcd/representations/adjoint.h | 2 +- Grid/qcd/representations/two_index.h | 2 +- Grid/qcd/utils/A2Autils.h | 16 +- Grid/qcd/utils/BaryonUtils.h | 14 +- Grid/qcd/utils/SUnAdjoint.h | 2 +- Grid/stencil/GeneralLocalStencil.h | 4 +- Grid/stencil/SimpleCompressor.h | 23 +- Grid/stencil/Stencil.h | 60 ++-- Grid/threads/Accelerator.cc | 8 +- Grid/util/Init.cc | 24 -- benchmarks/Benchmark_ITT.cc | 5 - benchmarks/Benchmark_memory_asynch.cc | 2 +- benchmarks/Benchmark_mooee.cc | 6 +- benchmarks/Benchmark_usqcd.cc | 2 +- configure.ac | 22 -- systems/Aurora/benchmarks/bench1.pbs | 4 +- systems/Aurora/benchmarks/bench2.pbs | 6 +- systems/Aurora/sourceme.sh | 34 +-- systems/Aurora/tests/reproBigJob.pbs | 16 +- tests/core/Test_fft.cc | 1 + tests/core/Test_gparity.cc | 2 + tests/core/Test_gparity_flavour.cc | 6 +- tests/core/Test_gpwilson_even_odd.cc | 4 +- tests/core/Test_memory_manager.cc | 2 +- tests/core/Test_sliceSum.cc | 4 +- tests/sp2n/Test_2as_base.cc | 4 +- 75 files changed, 668 insertions(+), 1082 deletions(-) rename Grid/qcd/action/fermion/{implementation => deprecated}/CayleyFermion5Dvec.h (99%) rename Grid/{stencil => qcd/action/fermion/deprecated}/Lebesgue.cc (99%) rename Grid/{stencil => qcd/action/fermion/deprecated}/Lebesgue.h (97%) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 2cbc895c..3c2eb428 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -168,6 +168,7 @@ public: template void FFT_dim(Lattice &result,const Lattice &source,int dim, int sign){ #ifndef HAVE_FFTW + std::cerr << "FFTW is not compiled but is called"< pgbuf(&pencil_g); autoView(pgbuf_v , pgbuf, CpuWrite); - + std::cout << "CPU view" << std::endl; + typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; @@ -213,6 +215,7 @@ public: else if ( sign == forward ) div = 1.0; else assert(0); + std::cout << "Making FFTW plan" << std::endl; FFTW_plan p; { FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; @@ -226,6 +229,7 @@ public: } // Barrel shift and collect global pencil + std::cout << "Making pencil" << std::endl; Coordinate lcoor(Nd), gcoor(Nd); result = source; int pc = processor_coor[dim]; @@ -247,6 +251,7 @@ public: } } + std::cout << "Looping orthog" << std::endl; // Loop over orthog coords int NN=pencil_g.lSites(); GridStopWatch timer; @@ -269,6 +274,7 @@ public: usec += timer.useconds(); flops+= flops_call*NN; + 
std::cout << "Writing back results " << std::endl; // writing out result { autoView(pgbuf_v,pgbuf,CpuRead); @@ -285,6 +291,7 @@ public: } result = result*div; + std::cout << "Destroying plan " << std::endl; // destroying plan FFTW::fftw_destroy_plan(p); #endif diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h index d41eb279..e00e94c9 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h @@ -102,11 +102,11 @@ public: assert(mass.size()==nshift); assert(mresidual.size()==nshift); - // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + // remove dynamic sized arrays on stack; 2d is a pain with vector + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h index 23baff61..c6102eb2 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h @@ -123,11 +123,11 @@ public: assert(mresidual.size()==nshift); // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD rsqf[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector rsqf(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index d3fb282a..24a3228a 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -156,11 +156,11 @@ public: assert(mresidual.size()==nshift); // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD rsqf[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector rsqf(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/multigrid/CoarsenedMatrix.h b/Grid/algorithms/multigrid/CoarsenedMatrix.h index 42634004..60a5920c 100644 --- a/Grid/algorithms/multigrid/CoarsenedMatrix.h +++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h @@ -99,7 +99,7 @@ public: CoarseMatrix AselfInvEven; CoarseMatrix AselfInvOdd; - Vector dag_factor; + deviceVector dag_factor; /////////////////////// // Interface @@ -124,9 +124,13 @@ public: int npoint = geom.npoint; typedef LatticeView Aview; - Vector AcceleratorViewContainer; + deviceVector AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); - for(int p=0;p Aview; - Vector AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;poSites(); - Vector points(geom.npoint, 0); - for(int p=0; p points(geom.npoint); + for(int p=0; p Aview; - Vector AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;p &out) { @@ -469,14 +484,20 @@ public: // determine in what order we 
need the points int npoint = geom.npoint-1; - Vector points(npoint, 0); - for(int p=0; p points(npoint); + for(int p=0; p AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;p h_dag_factor(nbasis*nbasis); thread_for(i, nbasis*nbasis, { int j = i/nbasis; int k = i%nbasis; - dag_factor[i] = dag_factor_eigen(j, k); + h_dag_factor[i] = dag_factor_eigen(j, k); }); + acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD)); } void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase > &linop, diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 293ce2fb..8946a364 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -174,21 +174,11 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -#ifdef ACCELERATOR_CSHIFT -// Cshift on device -template using cshiftAllocator = devAllocator; -#else -// Cshift on host -template using cshiftAllocator = std::allocator; -#endif +template using hostVector = std::vector >; // Needs autoview +template using Vector = std::vector >; // +template using uvmVector = std::vector >; // auto migrating page +template using deviceVector = std::vector >; // device vector -template using Vector = std::vector >; -template using stencilVector = std::vector >; -template using commVector = std::vector >; -template using deviceVector = std::vector >; -template using cshiftVector = std::vector >; - -/* template class vecView { protected: @@ -197,8 +187,9 @@ template class vecView ViewMode mode; void * cpu_ptr; public: + // Rvalue accessor accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; - vecView(std::vector &refer_to_me,ViewMode _mode) + vecView(Vector &refer_to_me,ViewMode _mode) { cpu_ptr = &refer_to_me[0]; size = refer_to_me.size(); @@ -214,26 +205,15 @@ template class vecView } }; -template vecView VectorView(std::vector &vec,ViewMode _mode) +template vecView VectorView(Vector &vec,ViewMode _mode) { vecView ret(vec,_mode); // does the open return ret; // must be closed } -// Little autoscope assister -template -class VectorViewCloser -{ - View v; // Take a copy of view and call view close when I go out of scope automatically - public: - VectorViewCloser(View &_v) : v(_v) {}; - ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);} -}; - #define autoVecView(v_v,v,mode) \ auto v_v = VectorView(v,mode); \ ViewCloser _autoView##v_v(v_v); -*/ NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc index 0d1707d9..37269785 100644 --- a/Grid/allocator/MemoryStats.cc +++ b/Grid/allocator/MemoryStats.cc @@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES) uint64_t virt_pfn = (uint64_t)Buf / page_size; off_t offset = sizeof(uint64_t) * virt_pfn; uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; + std::vector pagedata(npages); uint64_t ret = lseek(fd, offset, SEEK_SET); assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages); assert(ret == sizeof(uint64_t) * npages); int nhugepages = npages / 512; int n4ktotal, nnothuge; diff --git a/Grid/cshift/Cshift.h 
b/Grid/cshift/Cshift.h index c7b9e3cb..ae1dea51 100644 --- a/Grid/cshift/Cshift.h +++ b/Grid/cshift/Cshift.h @@ -51,7 +51,6 @@ Author: Peter Boyle #endif NAMESPACE_BEGIN(Grid); - template::value,void>::type * = nullptr> auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) { diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 309517b2..fdb98cd4 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -30,12 +30,11 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); extern std::vector > Cshift_table; -extern commVector > Cshift_table_device; +extern deviceVector > Cshift_table_device; inline std::pair *MapCshiftTable(void) { // GPU version -#ifdef ACCELERATOR_CSHIFT uint64_t sz=Cshift_table.size(); if (Cshift_table_device.size()!=sz ) { Cshift_table_device.resize(sz); @@ -45,16 +44,13 @@ inline std::pair *MapCshiftTable(void) sizeof(Cshift_table[0])*sz); return &Cshift_table_device[0]; -#else - return &Cshift_table[0]; -#endif // CPU version use identify map } /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,deviceVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dim { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for(i,ent,{ - buffer_p[table[i].first]=rhs_v[table[i].second]; - }); -#endif } } @@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; @@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for2d(n,e1,b,e2,{ - int o = n*n1; - int offset = b+n*e2; - - vobj temp =rhs_v[so+o+b]; - extract(temp,pointers,offset); - }); -#endif } else { Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for2d(n,e1,b,e2,{ - - Coordinate coor; - - int o=n*n1; - int oindex = o+b; - - int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); - - int ocb=1<(temp,pointers,offset); - } - }); -#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,deviceVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -245,17 +202,10 @@ template void Scatter_plane_simple (Lattice &rhs,cshiftVector< { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); -#ifdef 
ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); }); -#else - autoView( rhs_v, rhs, CpuWrite); - thread_for(i,ent,{ - rhs_v[table[i].first]=buffer_p[table[i].second]; - }); -#endif } } @@ -278,7 +228,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; -#ifdef ACCELERATOR_CSHIFT autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; @@ -287,14 +236,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); -#else - autoView( rhs_v , rhs, CpuWrite); - thread_for2d(n,e1,b,e2,{ - int o = n*_slice_stride; - int offset = b+n*_slice_block; - merge(rhs_v[so+o+b],pointers,offset); - }); -#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -360,19 +301,11 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs { auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); -#else - autoView(rhs_v , rhs, CpuRead); - autoView(lhs_v , lhs, CpuWrite); - thread_for(i,ent,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; - }); -#endif } } @@ -412,19 +345,11 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice Lattice Cshift(const Lattice &rhs,int dimension RealD t1,t0; t0=usecond(); if ( !comm_dim ) { - //std::cout << "CSHIFT: Cshift_local" < void Cshift_comms(Lattice& ret,const Lattice &r sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - // std::cout << "Cshift_comms dim "< void Cshift_comms_simd(Lattice& ret,const LatticeCheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - //std::cout << "Cshift_comms_simd dim "< void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -125,8 +123,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf; send_buf.resize(buffer_size); - static cshiftVector recv_buf; recv_buf.resize(buffer_size); + static deviceVector send_buf; send_buf.resize(buffer_size); + static deviceVector recv_buf; recv_buf.resize(buffer_size); int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -161,7 +159,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); - // grid->Barrier(); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, @@ -169,7 +167,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); xbytes+=bytes; - // grid->Barrier(); + grid->Barrier(); tcomms+=usecond(); tscatter-=usecond(); @@ -177,13 +175,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r tscatter+=usecond(); } } - /* std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -201,9 +197,9 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_simd_layout[dimension]; int comm_dim = grid->_processors[dimension] >1 ; - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "< void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; @@ -281,7 +277,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); - // grid->Barrier(); + grid->Barrier(); send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0]; @@ -292,7 +288,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + grid->Barrier(); tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; @@ -305,242 +301,12 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - GridBase *grid=rhs.Grid(); - Lattice temp(rhs.Grid()); - - int fd = rhs.Grid()->_fdimensions[dimension]; - int rd = rhs.Grid()->_rdimensions[dimension]; - int pd = rhs.Grid()->_processors[dimension]; - int simd_layout = rhs.Grid()->_simd_layout[dimension]; - int comm_dim = rhs.Grid()->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); - static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); - vobj *send_buf; - vobj *recv_buf; - { - grid->ShmBufferFreeAll(); - size_t bytes = buffer_size*sizeof(vobj); - send_buf=(vobj *)grid->ShmBufferMalloc(bytes); - recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); - } - - int cb= (cbmask==0x2)? 
Odd : Even; - int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(vobj); - - tgather-=usecond(); - Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); - tgather+=usecond(); - - // int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); - grid->SendToRecvFrom((void *)&send_buf[0], - xmit_to_rank, - (void *)&recv_buf[0], - recv_from_rank, - bytes); - xbytes+=bytes; - acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - - // grid->Barrier(); - tcomms+=usecond(); - - tscatter-=usecond(); - Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); - tscatter+=usecond(); - } - } - /* - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - GridBase *grid=rhs.Grid(); - const int Nsimd = grid->Nsimd(); - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - - int fd = grid->_fdimensions[dimension]; - int rd = grid->_rdimensions[dimension]; - int ld = grid->_ldimensions[dimension]; - int pd = grid->_processors[dimension]; - int simd_layout = grid->_simd_layout[dimension]; - int comm_dim = grid->_processors[dimension] >1 ; - - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; - // int words = sizeof(vobj)/sizeof(vector_type); - - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); - scalar_object * recv_buf_extract_mpi; - scalar_object * send_buf_extract_mpi; - { - size_t bytes = sizeof(scalar_object)*buffer_size; - grid->ShmBufferFreeAll(); - send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - } - for(int s=0;s pointers(Nsimd); // - ExtractPointerArray rpointers(Nsimd); // received pointers - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - int cb = (cbmask==0x2)? Odd : Even; - int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x>(permute_type+1)); - int ic= (i&inner_bit)? 
1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer - int nbr_ox = (nbr_coor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - int recv_from_rank; - int xmit_to_rank; - - if (nbr_ic) nbr_lane|=inner_bit; - - assert (sx == nbr_ox); - - if(nbr_proc){ - grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); - grid->SendToRecvFrom((void *)send_buf_extract_mpi, - xmit_to_rank, - (void *)recv_buf_extract_mpi, - recv_from_rank, - bytes); - acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); - xbytes+=bytes; - - // grid->Barrier(); - tcomms+=usecond(); - rpointers[i] = &recv_buf_extract[i][0]; - } else { - rpointers[i] = &send_buf_extract[nbr_lane][0]; - } - - } - tscatter-=usecond(); - Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - tscatter+=usecond(); - - } - /* - std::cout << GridLogPerformance << " Cshift (s) copy "< NAMESPACE_BEGIN(Grid); std::vector > Cshift_table; -commVector > Cshift_table_device; +deviceVector > Cshift_table_device; NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 03a869fb..c9c65928 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) typedef decltype(basis[0]) Field; typedef decltype(basis[0].View(AcceleratorRead)) View; - Vector basis_v; basis_v.reserve(basis.size()); - typedef typename std::remove_reference::type vobj; + hostVector h_basis_v(basis.size()); + deviceVector d_basis_v(basis.size()); + typedef typename std::remove_reference::type vobj; typedef typename std::remove_reference::type Coeff_t; + GridBase* grid = basis[0].Grid(); for(int k=0;k Bt(Nm * max_threads); - thread_region - { - vobj* B = &Bt[Nm * thread_num()]; - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - Vector Bt(siteBlock * nrot); + deviceVector Bt(siteBlock * nrot); auto Bp=&Bt[0]; // GPU readable copy of matrix - Vector Qt_jv(Nm*Nm); + hostVector h_Qt_jv(Nm*Nm); + deviceVector Qt_jv(Nm*Nm); Coeff_t *Qt_p = & Qt_jv[0]; thread_for(i,Nm*Nm,{ int j = i/Nm; int k = i%Nm; - Qt_p[i]=Qt(j,k); + h_Qt_jv[i]=Qt(j,k); }); + acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t)); // Block the loop to keep storage footprint down for(uint64_t s=0;s &basis,Eigen::MatrixXd& Qt,in result.Checkerboard() = basis[0].Checkerboard(); - Vector basis_v; basis_v.reserve(basis.size()); + hostVector h_basis_v(basis.size()); + deviceVector d_basis_v(basis.size()); for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; - for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ vobj zzz=Zero(); @@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in } coalescedWrite(result_v[ss], B); }); - for(int k=0;k diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 58004eac..92eb0562 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer 
osites) // const int Nsimd = vobj::Nsimd(); const int nthread = GridThread::GetThreads(); - Vector sumarray(nthread); + std::vector sumarray(nthread); for(int i=0;i sumarray(nthread); + std::vector sumarray(nthread); for(int i=0;i &z,sobj a,sobj b,const Lattice &x,const Latt autoView( x_v, x, AcceleratorRead); autoView( y_v, y, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); -#if 0 - typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); - }); - nrm = real(TensorRemove(sum(inner_tmp_v,sites))); -#else typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; deviceVector inner_tmp; inner_tmp.resize(sites); @@ -366,7 +354,6 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt coalescedWrite(z_v[ss],tmp); }); nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); -#endif grid->GlobalSum(nrm); return nrm; } @@ -377,7 +364,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti conformable(left,right); typedef typename vobj::vector_typeD vector_type; - Vector tmp(2); + std::vector tmp(2); GridBase *grid = left.Grid(); @@ -387,8 +374,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti // GPU typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t; - Vector inner_tmp(sites); - Vector norm_tmp(sites); + deviceVector inner_tmp(sites); + deviceVector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; { @@ -438,7 +425,9 @@ inline auto sum(const LatticeTrinaryExpression & expr) // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... 
////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) +template inline void sliceSum(const Lattice &Data, + std::vector &result, + int orthogdim) { /////////////////////////////////////////////////////// // FIXME precision promoted summation @@ -460,8 +449,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,Zero()); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,Zero()); // sum across these down to scalars ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node @@ -552,8 +541,8 @@ static void sliceInnerProductVector( std::vector & result, const Latti int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars ExtractBuffer > extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index e82494f5..91cb8226 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi // Move out of UVM // Turns out I had messed up the synchronise after move to compute stream // as running this on the default stream fools the synchronise -#undef UVM_BLOCK_BUFFER -#ifndef UVM_BLOCK_BUFFER - commVector buffer(numBlocks); + deviceVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); -#else - Vector buffer(numBlocks); - sobj *buffer_v = &buffer[0]; - sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); - accelerator_barrier(); - result = *buffer_v; -#endif return result; } @@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi const int words = sizeof(vobj)/sizeof(vector); - Vector buffer(osites); + deviceVector buffer(osites); vector *dat = (vector *)lat; vector *buf = &buffer[0]; iScalar *tbuf =(iScalar *) &buffer[0]; diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index 7dff7939..3718e6ea 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -10,7 +10,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; -#if 1 + sobj identity; zeroit(identity); sobj ret; zeroit(ret); Integer nsimd= vobj::Nsimd(); @@ -28,32 +28,6 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os } sobjD dret; convertType(dret,ret); return dret; -#else - static Vector mysum; - mysum.resize(1); - sobj 
*mysum_p = & mysum[0]; - sobj identity; zeroit(identity); - acceleratorPut(mysum[0],identity); - sobj ret ; - - Integer nsimd= vobj::Nsimd(); - - const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); - cgh.parallel_for(cl::sycl::range<1>{osites}, - Reduction, - [=] (cl::sycl::id<1> item, auto &sum) { - auto osite = item[0]; - sum +=Reduce(lat[osite]); - }); - }); - theGridAccelerator->wait(); - ret = mysum[0]; - // free(mysum,*theGridAccelerator); - sobjD dret; convertType(dret,ret); - return dret; -#endif } template @@ -97,7 +71,6 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite template Word svm_xor(Word *vec,uint64_t L) { -#if 1 Word identity; identity=0; Word ret = 0; { @@ -113,60 +86,7 @@ template Word svm_xor(Word *vec,uint64_t L) } theGridAccelerator->wait(); return ret; -#else - static Vector d_sum; - d_sum.resize(1); - Word *d_sum_p=&d_sum[0]; - Word identity; identity=0; - acceleratorPut(d_sum[0],identity); - const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); - cgh.parallel_for(cl::sycl::range<1>{L}, - Reduction, - [=] (cl::sycl::id<1> index, auto &sum) { - sum^=vec[index]; - }); - }); - theGridAccelerator->wait(); - Word ret = acceleratorGet(d_sum[0]); - // free(d_sum,*theGridAccelerator); - return ret; -#endif } NAMESPACE_END(Grid); -/* - -template -inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) -{ - typedef typename vobj::vector_type vector; - typedef typename vobj::scalar_type scalar; - - typedef typename vobj::scalar_typeD scalarD; - typedef typename vobj::scalar_objectD sobjD; - - sobjD ret; - scalarD *ret_p = (scalarD *)&ret; - - const int nsimd = vobj::Nsimd(); - const int words = sizeof(vobj)/sizeof(vector); - - Vector buffer(osites*nsimd); - scalar *buf = &buffer[0]; - vector *dat = (vector *)lat; - - for(int w=0;w inline void sliceSumReduction_cub_small(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template +inline void sliceSumReduction_cub_small(const vobj *Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) +{ size_t subvol_size = e1*e2; - commVector reduction_buffer(rd*subvol_size); + deviceVector reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; vobj zero_init; zeroit(zero_init); @@ -94,7 +103,15 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V #if defined(GRID_SYCL) -template inline void sliceSumReduction_sycl_small(const vobj *Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template +inline void sliceSumReduction_sycl_small(const vobj *Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { size_t subvol_size = e1*e2; @@ -105,7 +122,7 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, mysum[r] = vobj_zero; } - commVector reduction_buffer(rd*subvol_size); + deviceVector reduction_buffer(rd*subvol_size); auto rb_p = 
&reduction_buffer[0]; @@ -144,14 +161,23 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, } #endif -template inline void sliceSumReduction_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template +inline void sliceSumReduction_large(const vobj *Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) +{ typedef typename vobj::vector_type vector; const int words = sizeof(vobj)/sizeof(vector); const int osites = rd*e1*e2; - commVectorbuffer(osites); + deviceVectorbuffer(osites); vector *dat = (vector *)Data; vector *buf = &buffer[0]; - Vector lvSum_small(rd); + std::vector lvSum_small(rd); vector *lvSum_ptr = (vector *)&lvSum[0]; for (int w = 0; w < words; w++) { @@ -168,13 +194,18 @@ template inline void sliceSumReduction_large(const vobj *Data, Vecto for (int r = 0; r < rd; r++) { lvSum_ptr[w+words*r]=lvSum_small[r]; } - } - - } -template inline void sliceSumReduction_gpu(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +template +inline void sliceSumReduction_gpu(const Lattice &Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) { autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. if constexpr (sizeof(vobj) <= 256) { @@ -192,7 +223,15 @@ template inline void sliceSumReduction_gpu(const Lattice &Data } -template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template +inline void sliceSumReduction_cpu(const Lattice &Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction @@ -208,16 +247,20 @@ template inline void sliceSumReduction_cpu(const Lattice &Data }); } -template inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template inline void sliceSumReduction(const Lattice &Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { - #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) - +#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #else +#else sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #endif +#endif } diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index ad1496f5..c7dcbac9 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase inline void ScatterSlice(const cshiftVector &buf, +template inline void ScatterSlice(const deviceVector &buf, Lattice &lat, int x, int dim, @@ -140,7 +140,7 @@ template inline void ScatterSlice(const cshiftVector &buf, }); } -template inline void GatherSlice(cshiftVector &buf, +template inline void GatherSlice(deviceVector &buf, const Lattice &lat, int x, int dim, @@ -462,8 +462,8 @@ public: int 
rNsimd = Nsimd / simd[dimension]; assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); - static cshiftVector send_buf; - static cshiftVector recv_buf; + static deviceVector send_buf; + static deviceVector recv_buf; send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index cf39ec99..2c56c7ed 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -90,16 +90,16 @@ public: void M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - Vector &lower, - Vector &diag, - Vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - Vector &lower, - Vector &diag, - Vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); virtual void Instantiatable(void)=0; @@ -119,35 +119,35 @@ public: RealD mass_plus, mass_minus; // Save arguments to SetCoefficientsInternal - Vector _gamma; + std::vector _gamma; RealD _zolo_hi; RealD _b; RealD _c; // Cayley form Moebius (tanh and zolotarev) - Vector omega; - Vector bs; // S dependent coeffs - Vector cs; - Vector as; + std::vector omega; + std::vector bs; // S dependent coeffs + std::vector cs; + std::vector as; // For preconditioning Cayley form - Vector bee; - Vector cee; - Vector aee; - Vector beo; - Vector ceo; - Vector aeo; + std::vector bee; + std::vector cee; + std::vector aee; + std::vector beo; + std::vector ceo; + std::vector aeo; // LDU factorisation of the eeoo matrix - Vector lee; - Vector leem; - Vector uee; - Vector ueem; - Vector dee; + std::vector lee; + std::vector leem; + std::vector uee; + std::vector ueem; + std::vector dee; // Matrices of 5d ee inverse params - Vector > MatpInv; - Vector > MatmInv; - Vector > MatpInvDag; - Vector > MatmInvDag; + // std::vector > MatpInv; + // std::vector > MatmInv; + // std::vector > MatpInvDag; + // std::vector > MatmInvDag; /////////////////////////////////////////////////////////////// // Conserved current utilities @@ -187,7 +187,7 @@ public: protected: virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); - virtual void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c); + virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c); }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 2300afd3..bfc0dd8b 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -90,12 +90,12 @@ protected: RealD mass; RealD R; RealD ZoloHiInv; - Vector Beta; - Vector cc;; - Vector cc_d;; - Vector sqrt_cc; - Vector See; - Vector Aee; + std::vector Beta; + std::vector cc;; + std::vector cc_d;; + std::vector sqrt_cc; + std::vector See; + std::vector Aee; }; diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h index bcc97176..ff2420d5 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h @@ -69,10 +69,10 @@ public: // Instantiate different versions depending on Impl 
///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); virtual void RefreshShiftCoefficients(RealD new_shift); @@ -83,7 +83,7 @@ public: RealD _M5, const ImplParams& p=ImplParams()); protected: - void SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c); + void SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c); }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index 60cfc727..f7655f24 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -102,11 +102,11 @@ public: GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); ////////////////////////////////////////////////////////////////////////// @@ -164,8 +164,6 @@ public: DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; /////////////////////////////////////////////////////////////// // Conserved current utilities diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 5b26b35c..2641a6b8 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -100,7 +100,6 @@ public: int dag); void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -108,7 +107,6 @@ public: int dag); void DhopInternalOverlappedComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -116,7 +114,6 @@ public: int dag); void DhopInternalSerialComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -192,8 +189,6 @@ public: DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; // Comms buffer // std::vector > comm_buf; diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h index 6e4f79eb..39c21643 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h +++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h @@ -42,11 +42,11 @@ public: public: // Shift operator coefficients for red-black preconditioned Mobius EOFA - Vector Mooee_shift; - Vector 
MooeeInv_shift_lc; - Vector MooeeInv_shift_norm; - Vector MooeeInvDag_shift_lc; - Vector MooeeInvDag_shift_norm; + std::vector Mooee_shift; + std::vector MooeeInv_shift_lc; + std::vector MooeeInv_shift_norm; + std::vector MooeeInvDag_shift_lc; + std::vector MooeeInvDag_shift_norm; virtual void Instantiatable(void) {}; @@ -74,18 +74,18 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper, - Vector& shift_coeffs); + std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper, - Vector& shift_coeffs); + std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs); virtual void RefreshShiftCoefficients(RealD new_shift); diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index 5f69c2b1..9ec6be90 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -102,11 +102,11 @@ public: GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); ////////////////////////////////////////////////////////////////////////// @@ -152,9 +152,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - /////////////////////////////////////////////////////////////// // Conserved current utilities /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index 54f8547f..e50a9922 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -94,8 +94,8 @@ protected: RealD R; RealD amax; RealD scale; - Vector p; - Vector q; + std::vector p; + std::vector q; }; diff --git a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h index 1545c245..00ac222f 100644 --- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h +++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h @@ -35,7 +35,7 @@ template class KappaSimilarityTransform { public: INHERIT_IMPL_TYPES(Matrix); - Vector kappa, kappaDag, kappaInv, kappaInvDag; + std::vector kappa, kappaDag, kappaInv, kappaInvDag; 
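// [Editor's annotation, not part of the patch] This hunk is one instance of
// the patch-wide replacement of Grid's Vector<T> (unified-memory allocator)
// with plain std::vector<T> for host-side coefficient tables. Anything a GPU
// kernel dereferences is then staged into a deviceVector explicitly, as the
// *Cache.h hunks further down show. A minimal sketch of that idiom, with
// hypothetical names -- only deviceVector and acceleratorCopyToDevice are
// taken from the patch itself:
//
//   std::vector<Coeff_t> coeff(Ls);        // host-only storage
//   deviceVector<Coeff_t> d_coeff(Ls);     // device-resident copy
//   acceleratorCopyToDevice(&coeff[0], &d_coeff[0], Ls*sizeof(Coeff_t));
//   auto pcoeff = &d_coeff[0];             // raw pointer captured by accelerator_for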
KappaSimilarityTransform (Matrix &zmob) { for (int i=0;i<(int)zmob.bs.size();i++) { diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index d67105bb..c609be03 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -49,10 +49,10 @@ template class StaggeredKernels : public FermionOperator , pub public: - void DhopImproved(StencilImpl &st, LebesgueOrder &lo, + void DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior); - void DhopNaive(StencilImpl &st, LebesgueOrder &lo, + void DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior); diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 186fa278..baa1f684 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -47,7 +47,7 @@ public: static int PartialCompressionFactor(GridBase *grid) { return 1;} #endif template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -109,7 +109,7 @@ public: // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. //////////////////////////////////////////////////////////////////////////////////////////// template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -197,7 +197,7 @@ public: #endif template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -208,7 +208,7 @@ public: else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); } template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -402,7 +402,6 @@ public: typedef CartesianStencil Base; typedef typename Base::View_type View_type; - typedef typename Base::StencilVector StencilVector; // Vector surface_list; WilsonStencil(GridBase *grid, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index a7a1bb69..16320a93 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -126,14 +126,17 @@ public: void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternal(StencilImpl &st, + DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - const FermionField &in, FermionField &out, int dag); + void DhopInternalSerial(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - const FermionField &in, FermionField &out, int dag); 
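// [Editor's annotation, not part of the patch] The LebesgueOrder parameter is
// removed from the whole Dhop call chain (DhopInternal*, DhopImproved,
// DhopNaive) in the fermion-operator hunks below, and every call site shrinks
// in step:
//
//   Kernels::DhopImproved(st, U, UUU, in, out, dag, interior, exterior);
//   // previously: Kernels::DhopImproved(st, lo, U, UUU, in, out, dag, interior, exterior);
//
// The kernels fall back to natural site order, as the ASM_CALL hunk at the
// end of this patch makes explicit (ss = sss replaces st.lo->Reorder(sss)).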
+ void DhopInternalOverlappedComms(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, @@ -168,9 +171,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - WilsonAnisotropyCoefficients anisotropyCoeff; /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 0b07d320..d614b290 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -135,21 +135,18 @@ public: int dag); void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); void DhopInternalOverlappedComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); void DhopInternalSerialComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, @@ -203,9 +200,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - // Comms buffer // std::vector > comm_buf; diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h index fc8a7439..f8d1f11f 100644 --- a/Grid/qcd/action/fermion/ZMobiusFermion.h +++ b/Grid/qcd/action/fermion/ZMobiusFermion.h @@ -58,7 +58,7 @@ public: { // RealD eps = 1.0; std::cout< zgamma(this->Ls); + std::vector zgamma(this->Ls); for(int s=0;sLs;s++){ zgamma[s] = gamma[s]; } diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h similarity index 99% rename from Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h rename to Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h index e3bf67db..478fbb8b 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h @@ -1,3 +1,5 @@ +#if 0 + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -818,3 +820,5 @@ CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi, } NAMESPACE_END(Grid); + +#endif diff --git a/Grid/stencil/Lebesgue.cc b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc similarity index 99% rename from Grid/stencil/Lebesgue.cc rename to Grid/qcd/action/fermion/deprecated/Lebesgue.cc index 656ecca8..480483ed 100644 --- a/Grid/stencil/Lebesgue.cc +++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc @@ -1,3 +1,4 @@ +#if 0 /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void) } NAMESPACE_END(Grid); +#endif diff --git a/Grid/stencil/Lebesgue.h b/Grid/qcd/action/fermion/deprecated/Lebesgue.h similarity index 97% rename from Grid/stencil/Lebesgue.h rename to Grid/qcd/action/fermion/deprecated/Lebesgue.h index 25fa772e..0416ad80 100644 --- a/Grid/stencil/Lebesgue.h +++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.h @@ -72,7 +72,7 @@ public: void ThreadInterleave(void); private: - Vector _LebesgueReorder; + deviceVector _LebesgueReorder; }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h 
b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index 2b8a3a18..8dc4fbc8 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -156,18 +156,18 @@ template void CayleyFermion5D::M5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag (Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1]=mass_minus; - Vector lower(Ls,-1.0); lower[0] =mass_plus; + std::vector diag (Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1]=mass_minus; + std::vector lower(Ls,-1.0); lower[0] =mass_plus; M5D(psi,chi,chi,lower,diag,upper); } template void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - Vector diag = bs; - Vector upper= cs; - Vector lower= cs; + std::vector diag = bs; + std::vector upper= cs; + std::vector lower= cs; upper[Ls-1]=-mass_minus*upper[Ls-1]; lower[0] =-mass_plus*lower[0]; M5D(psi,psi,Din,lower,diag,upper); @@ -176,9 +176,9 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = beo; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = beo; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::Mooee (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::MooeeDag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for (int s=0;s void CayleyFermion5D::M5Ddag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); - Vector lower(Ls,-1.0); + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); + std::vector lower(Ls,-1.0); upper[Ls-1]=-mass_plus*upper[Ls-1]; lower[0] =-mass_minus*lower[0]; M5Ddag(psi,chi,chi,lower,diag,upper); @@ -248,9 +248,9 @@ template void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - Vector diag =bs; - Vector upper=cs; - Vector lower=cs; + std::vector diag =bs; + std::vector upper=cs; + std::vector lower=cs; for (int s=0;s::MeoDeriv(GaugeField &mat,const FermionField &U,const template void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) { - Vector gamma(this->Ls); + std::vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(1.0,gamma,b,c); } @@ -402,13 +402,13 @@ void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re template void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) { - Vector gamma(this->Ls); + std::vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(zolo_hi,gamma,b,c); } //Zolo template -void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) +void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c) { int Ls=this->Ls; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 0d2516c4..d3d88cbf 100644 --- 
a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -43,9 +43,9 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, - Vector &diag, - Vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); @@ -55,12 +55,16 @@ CayleyFermion5D::M5D(const FermionField &psi_i, autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - int Ls =this->Ls; + static deviceVector d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // 10 = 3 complex mult + 2 complex add // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) uint64_t nloop = grid->oSites(); @@ -82,9 +86,9 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, - Vector &diag, - Vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); @@ -93,12 +97,16 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - int Ls=this->Ls; + static deviceVector d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -126,11 +134,17 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; - auto plee = & lee [0]; - auto pdee = & dee [0]; - auto puee = & uee [0]; - auto pleem = & leem[0]; - auto pueem = & ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -182,11 +196,17 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi autoView(psi , psi_i,AcceleratorRead); autoView(chi , chi_i,AcceleratorWrite); - auto plee = & lee [0]; - auto pdee = & dee [0]; - auto puee = & uee [0]; - auto pleem = & leem[0]; - auto pueem = & ueem[0]; + static deviceVector d_lee(Ls); 
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; assert(psi.Checkerboard() == psi.Checkerboard()); diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 6b8336cc..8a9a0ffa 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid); // Pplus backwards.. template void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - Vector& lower, Vector& diag, Vector& upper) + std::vector& lower, std::vector& diag, std::vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; @@ -50,9 +50,15 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi autoView( psi , psi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol auto nloop=grid->oSites()/Ls; @@ -73,7 +79,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, - Vector& lower, Vector& diag, Vector& upper) + std::vector& lower, std::vector& diag, std::vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -83,9 +89,14 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio autoView( phi , phi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol @@ -114,13 +125,18 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; - auto plee = & this->lee[0]; - auto pdee = & this->dee[0]; - auto puee = & this->uee[0]; - - auto pleem = & this->leem[0]; - auto pueem = & this->ueem[0]; + static deviceVector d_lee(Ls); 
acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; + uint64_t nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h index 64ee4033..53b44ca2 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h @@ -131,9 +131,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, FermionField& chi else{ shiftm = -shift*(mq3-mq2); } } - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; - Vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; + std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; #if(0) std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; @@ -168,9 +168,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, FermionField& else{ shiftm = -shift*(mq3-mq2); } } - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; - Vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; + std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -181,9 +181,9 @@ void DomainWallEOFAFermion::Mooee(const FermionField& psi, FermionField& c { int Ls = this->Ls; - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; @@ -200,9 +200,9 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField { int Ls = this->Ls; - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; @@ -218,7 +218,7 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField //Zolo template -void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c) +void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c) { int Ls = this->Ls; int pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index d235abbb..d2b4450e 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian UUUmu(&FourDimGrid), UUUmuEven(&FourDimRedBlackGrid), UUUmuOdd(&FourDimRedBlackGrid), - Lebesgue(&FourDimGrid), - 
LebesgueEvenOdd(&FourDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid) { @@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, /*CHANGE */ template -void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + DhopInternalOverlappedComms(st,U,UUU,in,out,dag); else - DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopInternalSerialComms(st,U,UUU,in,out,dag); } template -void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { @@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & { int interior=1; int exterior=0; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } st.CommsMerge(compressor); @@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & { int interior=0; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } template -void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { @@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, { int interior=1; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } /*CHANGE END*/ @@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionFie assert(in.Checkerboard()==Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag); + DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag); } template void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) @@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionFie assert(in.Checkerboard()==Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag); + DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag); } template void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) @@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); + DhopInternal(Stencil,Umu,UUUmu,in,out,dag); } ///////////////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 4c80a1d5..bd9dd132 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ 
b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -48,8 +48,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, G StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), @@ -339,7 +337,7 @@ void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField & out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag); + DhopInternal(Stencil, Umu, UUUmu, in, out, dag); } template @@ -351,7 +349,7 @@ void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag); } template @@ -363,7 +361,7 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag); } template @@ -394,19 +392,19 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel template -void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + DhopInternalOverlappedComms(st,U,UUU,in,out,dag); else - DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopInternalSerialComms(st,U,UUU,in,out,dag); } template -void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -429,7 +427,7 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st { int interior=1; int exterior=0; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } st.CommunicateComplete(requests); @@ -440,13 +438,13 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st { int interior=0; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } template -void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -460,7 +458,7 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le { int interior=1; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } }; diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 617a18df..4827e516 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ 
b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid); template void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper) + std::vector &lower, std::vector &diag, std::vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -50,9 +50,13 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -74,8 +78,8 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField template void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper, - Vector &shift_coeffs) + std::vector &lower, std::vector &diag, std::vector &upper, + std::vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -86,13 +90,18 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator - + assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - auto pshift_coeffs = &shift_coeffs[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + static deviceVector d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + auto pshift_coeffs = &d_shift_coeffs[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -119,7 +128,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion template void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper) + std::vector &lower, std::vector &diag, std::vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -130,9 +139,13 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol int 
nloop = grid->oSites()/Ls; @@ -154,8 +167,8 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie template void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper, - Vector &shift_coeffs) + std::vector &lower, std::vector &diag, std::vector &upper, + std::vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -167,10 +180,15 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - auto pshift_coeffs = &shift_coeffs[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + static deviceVector d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + auto pshift_coeffs = &d_shift_coeffs[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol auto pm = this->pm; @@ -212,11 +230,17 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } @@ -268,14 +292,24 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); + // Move into object and constructor + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto pm = this->pm; - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; - auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0]; - auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0]; + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; 
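// [Editor's annotation, not part of the patch] The static deviceVector idiom
// used throughout these Cache.h hunks re-copies the Ls-element coefficient
// arrays host-to-device on every call; the static storage only makes the
// allocation persistent per template instantiation. The "Move into object and
// constructor" comment in the MooeeInv_shift hunk above flags the intended
// follow-up, roughly (hypothetical member names):
//
//   // class members:   deviceVector<Coeff_t> d_lee, d_dee, d_uee, d_leem, d_ueem;
//   // in constructor:  d_lee.resize(Ls);
//   //                  acceleratorCopyToDevice(&lee[0], &d_lee[0], Ls*sizeof(Coeff_t));
//   // Mooee* methods then read the cached device copies with no per-call traffic.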
+ + static deviceVector d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); + static deviceVector d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); + auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0]; + auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -333,11 +367,17 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -386,14 +426,28 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi autoView(chi , chi_i, AcceleratorWrite); int Ls = this->Ls; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto pm = this->pm; - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; - auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; - auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; + + static deviceVector d_MooeeInvDag_shift_lc(Ls); + static deviceVector d_MooeeInvDag_shift_norm(Ls); + acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); + auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0]; + auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0]; + + // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; + // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h index 9b9db178..70f06dfc 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h +++ 
b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h @@ -196,9 +196,9 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - Vector lower(Ls,-1.0); lower[0] = this->mq1; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + std::vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } @@ -212,9 +212,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - Vector lower(Ls,-1.0); lower[0] = this->mq1; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + std::vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -230,9 +230,9 @@ void MobiusEOFAFermion::Mooee(const FermionField& psi, FermionField& chi) int Ls = this->Ls; // coefficients of Mooee - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; lower[s] = -this->cee[s]; @@ -253,9 +253,9 @@ void MobiusEOFAFermion::MooeeDag(const FermionField& psi, FermionField& ch int Ls = this->Ls; // coefficients of MooeeDag - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s+1]; @@ -314,10 +314,10 @@ void MobiusEOFAFermion::SetCoefficientsPrecondShiftOps() // Tridiagonal solve for MooeeInvDag_shift_lc { Coeff_t m(0.0); - Vector d = Mooee_shift; - Vector u(Ls,0.0); - Vector y(Ls,0.0); - Vector q(Ls,0.0); + std::vector d = Mooee_shift; + std::vector u(Ls,0.0); + std::vector y(Ls,0.0); + std::vector q(Ls,0.0); if(pm == 1){ u[0] = 1.0; } else{ u[Ls-1] = 1.0; } diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h index bf23d99d..b596dc44 100644 --- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -48,8 +48,6 @@ NaiveStaggeredFermion::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), @@ -268,7 +266,7 @@ void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); + DhopInternal(Stencil, Umu, in, out, dag); } template @@ -280,7 +278,7 @@ void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &o assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, in, out, dag); } template @@ -292,7 +290,7 @@ void NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &o assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, in, 
out, dag); } template @@ -323,18 +321,18 @@ void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField & template -void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else - DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopInternalSerialComms(st,U,in,out,dag); } template -void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -356,7 +354,7 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L { int interior=1; int exterior=0; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } st.CommunicateComplete(requests); @@ -367,12 +365,12 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L { int interior=0; int exterior=1; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } } template -void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -385,7 +383,7 @@ void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Lebes { int interior=1; int exterior=1; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } }; diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h index 2b6087bc..04337671 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h @@ -375,23 +375,6 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, } } -/* -#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ - \ - template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ - \ - template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ -*/ #undef LOAD_CHI #undef HAND_DECLARATIONS diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index a39b529f..05dbf3b2 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -256,7 +256,7 @@ void StaggeredKernels::DhopDirKernel(StencilImpl &st, 
DoubledGaugeFieldVie }); template -void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior) { @@ -294,7 +294,7 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, assert(0 && " Kernel optimisation case not covered "); } template -void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 95af4c38..2ad48926 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -58,15 +58,9 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, Umu(_FourDimGrid), UmuEven(_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid), - Lebesgue(_FourDimGrid), - LebesgueEvenOdd(_FourDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid), Dirichlet(0) { - Stencil.lo = &Lebesgue; - StencilEven.lo = &LebesgueEvenOdd; - StencilOdd.lo = &LebesgueEvenOdd; - // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); @@ -305,19 +299,19 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, } template -void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternal(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else - DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopInternalSerialComms(st,U,in,out,dag); } template -void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { @@ -331,10 +325,12 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Start comms // Gather intranode and extra node differentiated?? 
///////////////////////////// { + std::cout << " WilsonFermion5D gather " < > requests; auto id=traceStart("Communicate overlapped"); st.CommunicateBegin(requests); @@ -343,6 +339,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Overlap with comms ///////////////////////////// { + std::cout << " WilsonFermion5D Comms merge " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg ///////////////////////////// // do the compute interior ///////////////////////////// + std::cout << " WilsonFermion5D Interior " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg ///////////////////////////// // Complete comms ///////////////////////////// + std::cout << " WilsonFermion5D Comms Complete " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // do the compute exterior ///////////////////////////// { + std::cout << " WilsonFermion5D Comms Merge " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg GRID_TRACE("DhopExterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); } + std::cout << " WilsonFermion5D Done " < -void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) @@ -395,11 +397,13 @@ void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOr int LLs = in.Grid()->_rdimensions[0]; + std::cout << " WilsonFermion5D Halo exch " <::DhopInternalSerialComms(StencilImpl & st, LebesgueOr GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); } + std::cout << " WilsonFermion5D Done " <::DhopOE(const FermionField &in, FermionField &out,int assert(in.Checkerboard()==Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag); + DhopInternal(StencilEven,UmuOdd,in,out,dag); } template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) @@ -431,7 +436,7 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int assert(in.Checkerboard()==Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag); + DhopInternal(StencilOdd,UmuEven,in,out,dag); } template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) @@ -441,7 +446,7 @@ void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int d out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil,Lebesgue,Umu,in,out,dag); + DhopInternal(Stencil,Umu,in,out,dag); } template void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 1a262533..8c58f692 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -52,17 +52,12 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), _tmp(&Hgrid), anisotropyCoeff(anis) { - Stencil.lo = &Lebesgue; - StencilEven.lo = &LebesgueEvenOdd; - StencilOdd.lo = &LebesgueEvenOdd; // Allocate the required comms buffer ImportGauge(_Umu); if 
(anisotropyCoeff.isAnisotropic){ @@ -314,7 +309,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); + DhopInternal(Stencil, Umu, in, out, dag); } template @@ -326,7 +321,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, in, out, dag); } template @@ -338,7 +333,7 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, in, out, dag); } template @@ -391,21 +386,21 @@ void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out, }; template -void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void WilsonFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else #endif - DhopInternalSerial(st,lo,U,in,out,dag); + DhopInternalSerial(st,U,in,out,dag); } template -void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -474,10 +469,10 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO template -void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, int dag) +void WilsonFermion::DhopInternalSerial(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { GRID_TRACE("DhopSerial"); assert((dag == DaggerNo) || (dag == DaggerYes)); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h index e025ba41..2633c127 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h @@ -40,11 +40,11 @@ Author: paboyle /// Switch off the 5d vectorised code optimisations #undef DWFVEC5D -static Vector signsF; +static std::vector signsF; template - int setupSigns(Vector& signs ){ - Vector bother(2); + int setupSigns(std::vector& signs ){ + std::vector bother(2); signs = bother; vrsign(signs[0]); visign(signs[1]); @@ -364,7 +364,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled #include -static Vector signsD; +static std::vector signsD; static int signInitD = setupSigns(signsD); #define MAYBEPERM(A,perm) if (perm) { A ; } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 90defc54..43662b9c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -434,7 +434,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #define ASM_CALL(A) \ thread_for( sss, Nsite, { \ - int ss = st.lo->Reorder(sss); \ + int ss = sss; 
/*st.lo->Reorder(sss);*/ \ int sU = ss; \ int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ diff --git a/Grid/qcd/representations/adjoint.h b/Grid/qcd/representations/adjoint.h index ee54b465..8d7e9e3c 100644 --- a/Grid/qcd/representations/adjoint.h +++ b/Grid/qcd/representations/adjoint.h @@ -40,7 +40,7 @@ public: U = Zero(); LatticeColourMatrix tmp(Uin.Grid()); - Vector::Matrix> ta(Dimension); + std::vector::Matrix> ta(Dimension); // Debug lines // LatticeMatrix uno(Uin.Grid()); diff --git a/Grid/qcd/representations/two_index.h b/Grid/qcd/representations/two_index.h index 24d6d7cb..c9c1db94 100644 --- a/Grid/qcd/representations/two_index.h +++ b/Grid/qcd/representations/two_index.h @@ -43,7 +43,7 @@ public: U = Zero(); LatticeColourMatrix tmp(Uin.Grid()); - Vector::Matrix> eij(Dimension); + std::vector::Matrix> eij(Dimension); for (int a = 0; a < Dimension; a++) GaugeGroupTwoIndex::base(a, eij[a]); diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index b63d8571..a81ebe6c 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -158,12 +158,12 @@ void A2Autils::MesonField(TensorType &mat, int MFrvol = rd*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for( r, MFrvol,{ lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol,{ lsSum[r]=scalar_type(0.0); }); @@ -346,12 +346,12 @@ void A2Autils::PionFieldXX(Eigen::Tensor &mat, int MFrvol = rd*Lblock*Rblock; int MFlvol = ld*Lblock*Rblock; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol,{ lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol,{ lsSum[r]=scalar_type(0.0); }); @@ -493,12 +493,12 @@ void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, int MFrvol = rd*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol,{ lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol,{ lsSum[r]=scalar_type(0.0); }); @@ -700,13 +700,13 @@ void A2Autils::AslashField(TensorType &mat, int MFrvol = rd*Lblock*Rblock*Nem; int MFlvol = ld*Lblock*Rblock*Nem; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol, { lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol, { lsSum[r] = scalar_type(0.0); diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 9d9cb508..9a1d312b 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -971,7 +971,9 @@ void BaryonUtils::BaryonGamma3pt( autoView( vq_ti , q_ti , AcceleratorRead); autoView( vq_tf , q_tf , AcceleratorRead); - Vector my_Dq_spec{Dq_spec1,Dq_spec2}; + deviceVector my_Dq_spec(2); + acceleratorPut(my_Dq_spec[0],Dq_spec1); + acceleratorPut(my_Dq_spec[1],Dq_spec2); mobj * Dq_spec_p = &my_Dq_spec[0]; if (group == 1) { @@ -1300,7 +1302,8 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - Vector my_Dq_spec{Du_spec}; + deviceVector my_Dq_spec(1); + acceleratorPut(my_Dq_spec[0],Du_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ @@ -1353,7 +1356,8 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, autoView( vd_tf , qd_tf , AcceleratorRead ); autoView( vs_ti , qs_ti , AcceleratorRead ); - Vector my_Dq_spec{Du_spec}; + deviceVector 
my_Dq_spec(1); + acceleratorPut(my_Dq_spec[0],Du_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ @@ -1544,7 +1548,9 @@ void BaryonUtils::XiToSigmaEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - Vector my_Dq_spec{Dd_spec,Ds_spec}; + deviceVector my_Dq_spec(2); + acceleratorPut(my_Dq_spec[0],Dd_spec); + acceleratorPut(my_Dq_spec[1],Ds_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h index 84c7278c..cfc48bbf 100644 --- a/Grid/qcd/utils/SUnAdjoint.h +++ b/Grid/qcd/utils/SUnAdjoint.h @@ -62,7 +62,7 @@ public: // returns i(T_Adj)^index necessary for the projectors // see definitions above iAdjTa = Zero(); - Vector > ta(ncolour * ncolour - 1); + iSUnMatrix ta[ncolour * ncolour - 1]; iSUnMatrix tmp; // FIXME not very efficient to get all the generators everytime diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index b6848977..66d25bc4 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -72,7 +72,7 @@ public: } // Resident in managed memory - Vector _entries; + deviceVector _entries; GeneralLocalStencil(GridBase *grid, const std::vector &shifts) { @@ -141,7 +141,7 @@ public: //////////////////////////////////////////////// // Store in look up table //////////////////////////////////////////////// - this->_entries[lex] = SE; + acceleratorPut(this->_entries[lex],SE); } }); } diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index dabd70a6..eca9cd3c 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -19,7 +19,7 @@ public: static int PartialCompressionFactor(GridBase *grid) {return 1;}; // Decompress is after merge so ok template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -35,7 +35,7 @@ public: rhs_v.ViewClose(); } template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -83,25 +83,6 @@ public: // Wilson compressor will add alternate policies for Dirichlet // and possibly partial Dirichlet for DWF //////////////////////////////////// -/* -class FaceGatherDirichlet -{ - // If it's dirichlet we don't assemble comms buffers - // - // Rely on zeroes in gauge field to drive the correct result - // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute - template - static void Gather_plane_simple (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so){}; - template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, - compressor &compress,int type) {} - template - static void Merge(decompressor decompress,Merge &mm) { } - template - static void Decompress(decompressor decompress,Decompression &dd) {} -}; -*/ template class SimpleCompressorGather : public FaceGather { diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 80acb4ae..0918df8e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -31,7 +31,6 @@ #define STENCIL_MAX (16) #include // subdir aggregate -#include // subdir aggregate
#include ////////////////////////////////////////////////////////////////////////////////////////// @@ -256,7 +255,6 @@ protected: GridBase * _grid; public: GridBase *Grid(void) const { return _grid; } - LebesgueOrder *lo; //////////////////////////////////////////////////////////////////////// // Needed to conveniently communicate gparity parameters into GPU memory @@ -273,11 +271,11 @@ public: int face_table_computed; int partialDirichlet; int fullDirichlet; - std::vector > > face_table ; - Vector surface_list; + std::vector > > face_table ; + deviceVector surface_list; - stencilVector _entries; // Resident in managed memory - commVector _entries_device; // Resident in device memory + std::vector _entries; // Resident in host memory + deviceVector _entries_device; // Resident in device memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -370,7 +368,6 @@ public: // accelerator_barrier(); // All kernels should ALREADY be complete // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. -#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -379,23 +376,6 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } -#else -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferMalloc(Packets[i].xbytes); - Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); - if ( Packets[i].do_send ) { - acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); - } - _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].host_send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].host_recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); - } -#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) 
for(int i=0;iStencilBarrier(); -#ifndef ACCELERATOR_AWARE_MPI -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferFreeAll(); -#endif // run any checksums for(int i=0;i_npoints;point++){ this->same_node[point] = this->SameNode(point); } - + int32_t surface_list_size=0; for(int site = 0 ;site< vol4;site++){ int local = 1; for(int point=0;point_npoints;point++){ @@ -678,11 +649,28 @@ public: } if(local == 0) { for(int s=0;s_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + for(int s=0;s 0); - } -#endif - if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ - arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); - GridCmdOptionIntVector(arg,LebesgueOrder::Block); - } if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){ GridLogTimestamp(0); } else { diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 2b1f6261..c42136b6 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -644,11 +644,6 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); -#ifdef KNL - LebesgueOrder::Block = std::vector({8,2,2,2}); -#else - LebesgueOrder::Block = std::vector({2,2,2,2}); -#endif Benchmark::Decomposition(); int do_su4=1; diff --git a/benchmarks/Benchmark_memory_asynch.cc b/benchmarks/Benchmark_memory_asynch.cc index 97825144..4c27fc2c 100644 --- a/benchmarks/Benchmark_memory_asynch.cc +++ b/benchmarks/Benchmark_memory_asynch.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(std::vector({56,17,89,101})); std::vector stop(threads); - Vector sum(threads); + std::vector sum(threads); std::vector x(threads,&Grid); for(int t=0;t diag = Dw.bs; - Vector upper= Dw.cs; - Vector lower= Dw.cs; + std::vector diag = Dw.bs; + std::vector upper= Dw.cs; + std::vector lower= Dw.cs; upper[Ls-1]=-Dw.mass_minus*upper[Ls-1]; lower[0] =-Dw.mass_plus*lower[0]; diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index 870cb6ec..d2bbf769 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -861,7 +861,7 @@ int main (int argc, char ** argv) } CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); - LebesgueOrder::Block = std::vector({2,2,2,2}); + // LebesgueOrder::Block = std::vector({2,2,2,2}); Benchmark::Decomposition(); diff --git a/configure.ac b/configure.ac index 8e8d67af..652944f9 100644 --- a/configure.ac +++ b/configure.ac @@ -225,18 +225,6 @@ case ${ac_SFW_FP16} in AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; esac -############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons -AC_ARG_ENABLE([accelerator-aware-mpi], - [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], - [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) - -case ${ac_ACCELERATOR_AWARE_MPI} in - yes) - AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) - AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; - *);; -esac - ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], @@ -664,16 +652,6 @@ case ${ac_SHM_FAST_PATH} in *) ;; esac -############### communication type selection -AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use 
multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) - -case ${ac_COMMS_THREADS} in - yes) - AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] ) - ;; - *) ;; -esac - ############### communication type selection AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs index b53327f0..a202b587 100644 --- a/systems/Aurora/benchmarks/bench1.pbs +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -1,6 +1,6 @@ #!/bin/bash -#PBS -q debug +#PBS -q EarlyAppAccess #PBS -l select=1 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA @@ -44,7 +44,7 @@ CMD="mpiexec -np 1 -ppn 1 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -#$CMD | tee 1tile.dwf +$CMD | tee 1tile.dwf CMD="mpiexec -np 12 -ppn 12 -envall \ ./gpu_tile_compact.sh \ diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index ea469cda..ce477319 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -1,6 +1,6 @@ #!/bin/bash -#PBS -q workq +#PBS -q EarlyAppAccess #PBS -l select=2 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA @@ -43,13 +43,13 @@ $CMD | tee 2node.comms CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " $CMD | tee 2node.32.32.64.48.dwf CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " $CMD | tee 2node.64.64.64.96.dwf diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7abe667f..8ccba356 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,40 +1,12 @@ +module load oneapi/release/2023.12.15.001 +#module load intel_compute_runtime/release/821.35 source ~/spack/share/spack/setup-env.sh spack load c-lime +spack load openssl export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` -#spack load libefence -#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` -#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH -#spack load gperftools -export TCMALLOC=/home/paboyle/gperftools/install -export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH -export INTELGT_AUTO_ATTACH_DISABLE=1 - -#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -#module load oneapi/release/2023.12.15.001 -#module use /soft/modulefiles -#module load intel_compute_runtime/release/agama-devel-682.22 - -#export FI_CXI_DEFAULT_CQ_SIZE=131072 -#export FI_CXI_CQ_FILL_PERCENT=20 -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" - -# -# -ftarget-register-alloc-mode=pvc:default -# -ftarget-register-alloc-mode=pvc:small -# -ftarget-register-alloc-mode=pvc:large -# -ftarget-register-alloc-mode=pvc:auto -#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 - export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export 
HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 git config --global http.proxy http://proxy.alcf.anl.gov:3128 - -#source ~/spack/share/spack/setup-env.sh -#spack load gperftools -#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` -#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH - export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs index 205fefce..721b4707 100644 --- a/systems/Aurora/tests/reproBigJob.pbs +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -1,6 +1,6 @@ #!/bin/bash -#PBS -l select=16 +#PBS -l select=32 #PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 @@ -15,7 +15,7 @@ # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 -export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 @@ -24,14 +24,14 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export GRID_PRINT_ENTIRE_LOG=0 -export GRID_CHECKSUM_RECV_BUF=0 -export GRID_CHECKSUM_SEND_BUF=0 +export GRID_CHECKSUM_RECV_BUF=1 +export GRID_CHECKSUM_SEND_BUF=1 export MPICH_OFI_NIC_POLICY=GPU @@ -51,10 +51,10 @@ cd $DIR cp $PBS_NODEFILE nodefile -CMD="mpiexec -np 192 -ppn 12 -envall --hostfile nodefile \ +CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \ ../gpu_tile_compact.sh \ - ../Test_dwf_mixedcg_prec --mpi 4.4.4.3 --grid 128.128.128.96 \ - --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + ../Test_dwf_mixedcg_prec --mpi 4.4.4.6 --grid 128.128.128.96 \ + --shm-mpi 1 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals" echo $CMD > command-line env > environment diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 212b1a35..e60d3555 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -88,6 +88,7 @@ int main (int argc, char ** argv) Ctilde=C; std::cout<<" Benchmarking FFT of LatticeComplex "< testAlgebra; @@ -148,11 +149,12 @@ void checkSigma(const GparityFlavour::Algebra a, GridSerialRNG &rng) test(m*g, m*testg); std::cout << std::endl; } +#endif int main(int argc, char *argv[]) { Grid_init(&argc,&argv); - +#ifdef ENABLE_GPARITY Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -170,7 +172,7 @@ int main(int argc, char *argv[]) checkSigma(i, sRNG); } std::cout << GridLogMessage << std::endl; - +#endif Grid_finalize(); return EXIT_SUCCESS; diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index c8587435..0f3c8aad 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -35,7 +35,7 @@ using namespace Grid; int main (int 
argc, char ** argv) { Grid_init(&argc,&argv); - +#ifdef ENABLE_GPARITY Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -216,6 +216,6 @@ int main (int argc, char ** argv) std::cout<oSites(),1,{ - assert(B[v]==A_v[ss]()()().getlane(0)); + // assert(B[v]==A_v[ss]()()().getlane(0)); }); // std::cout << "["< inline void sliceSumCPU(const Grid::Lattice &Data,std int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,Zero()); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,Zero()); // sum across these down to scalars ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node diff --git a/tests/sp2n/Test_2as_base.cc b/tests/sp2n/Test_2as_base.cc index 62e86609..3aeccae0 100644 --- a/tests/sp2n/Test_2as_base.cc +++ b/tests/sp2n/Test_2as_base.cc @@ -87,8 +87,8 @@ static void run_generators_checks() { typedef typename Sp_TwoIndex::template iGroupMatrix Matrix; int sum = 0; int sum_im = 0; - Vector ta_fund(this_algebra_dim); - Vector eij(this_irrep_dim); + std::vector ta_fund(this_algebra_dim); + std::vector eij(this_irrep_dim); Matrix tmp_l; Matrix tmp_r; for (int n = 0; n < this_algebra_dim; n++) From e637fbacae1c4f256f00531faf67cdad92027b0c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Sep 2024 09:42:43 +0000 Subject: [PATCH 07/50] Verbose remove --- Grid/cshift/Cshift_mpi.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 91369b74..3a4e0361 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -55,13 +55,13 @@ template Lattice Cshift(const Lattice &rhs,int dimension RealD t1,t0; t0=usecond(); if ( !comm_dim ) { - std::cout << "CSHIFT: Cshift_local" < void Cshift_comms(Lattice& ret,const Lattice &r sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - std::cout << "Cshift_comms dim "< void Cshift_comms_simd(Lattice& ret,const LatticeCheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - std::cout << "Cshift_comms_simd dim "< void Cshift_comms_simd(Lattice &ret,const Lattice_simd_layout[dimension]; int comm_dim = grid->_processors[dimension] >1 ; - std::cout << "Cshift_comms_simd dim "<< dimension << " fd "< Date: Mon, 23 Sep 2024 09:43:50 +0000 Subject: [PATCH 08/50] Almost working on Aurora --- .../WilsonFermion5DImplementation.h | 22 +++---- Grid/stencil/Stencil.h | 17 +++--- Grid/util/Init.cc | 42 +++++++++++-- systems/Aurora/benchmarks/bench1.pbs | 61 +++++-------------- systems/Aurora/benchmarks/bench2.pbs | 27 ++++---- systems/Aurora/sourceme.sh | 2 + systems/Aurora/tests/reproBigJob.pbs | 45 ++++++++------ 7 files changed, 118 insertions(+), 98 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 2ad48926..32e8108e 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -325,12 
+325,12 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, // Start comms // Gather intranode and extra node differentiated?? ///////////////////////////// { - std::cout << " WilsonFermion5D gather " < > requests; auto id=traceStart("Communicate overlapped"); st.CommunicateBegin(requests); @@ -339,7 +339,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, // Overlap with comms ///////////////////////////// { - std::cout << " WilsonFermion5D Comms merge " <::DhopInternalOverlappedComms(StencilImpl & st, ///////////////////////////// // do the compute interior ///////////////////////////// - std::cout << " WilsonFermion5D Interior " <::DhopInternalOverlappedComms(StencilImpl & st, ///////////////////////////// // Complete comms ///////////////////////////// - std::cout << " WilsonFermion5D Comms Complete " <::DhopInternalOverlappedComms(StencilImpl & st, // do the compute exterior ///////////////////////////// { - std::cout << " WilsonFermion5D Comms Merge " <::DhopInternalOverlappedComms(StencilImpl & st, GRID_TRACE("DhopExterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); } - std::cout << " WilsonFermion5D Done " <::DhopInternalSerialComms(StencilImpl & st, int LLs = in.Grid()->_rdimensions[0]; - std::cout << " WilsonFermion5D Halo exch " <::DhopInternalSerialComms(StencilImpl & st, GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); } - std::cout << " WilsonFermion5D Done " < > &reqs) { // All GPU kernel tasks must complete - // accelerator_barrier(); // All kernels should ALREADY be complete - // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + accelerator_barrier(); // All kernels should ALREADY be complete + _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. 
for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, @@ -390,8 +390,8 @@ public: if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete - // accelerator_barrier(); + acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete + accelerator_barrier(); _grid->StencilBarrier(); // run any checksums for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { - // accelerator_barrier(); + accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -487,6 +487,7 @@ public: HaloGatherDir(source,compress,point,face_idx); } accelerator_barrier(); // All my local gathers are complete + _grid->StencilBarrier();// Synch shared memory on a single nodes face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); } @@ -653,7 +654,9 @@ public: } } } + std::cout << "BuildSurfaceList size is "< surface_list_host(surface_list_size); int32_t ss=0; for(int site = 0 ;site< vol4;site++){ int local = 1; @@ -665,12 +668,12 @@ public: if(local == 0) { for(int s=0;ssi_signo); + fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); + fprintf(stderr," code %d\n",si->si_code); + // x86 64bit +#ifdef __linux__ +#ifdef __x86_64__ + ucontext_t * uc= (ucontext_t *)ptr; + struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; + fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); +#endif +#endif + fflush(stderr); + BACKTRACEFP(stderr); + fprintf(stderr,"Called backtrace\n"); + fflush(stdout); + fflush(stderr); + return; +} + void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { + fprintf(stderr,"Signal handler on host %s\n",hostname); fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," code %d\n",si->si_code); @@ -561,7 +584,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) ucontext_t * uc= (ucontext_t *)ptr; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); -#define REG(A) printf(" %s %lx\n",#A,sc-> A); +#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A); REG(rdi); REG(rsi); REG(rbp); @@ -594,8 +617,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_exit_handler(void) { - BACKTRACEFP(stdout); - fflush(stdout); + // BACKTRACEFP(stdout); + // fflush(stdout); } void Grid_debug_handler_init(void) { @@ -603,10 +626,10 @@ void Grid_debug_handler_init(void) sigemptyset (&sa.sa_mask); sa.sa_sigaction= Grid_sa_signal_handler; sa.sa_flags = SA_SIGINFO; - sigaction(SIGSEGV,&sa,NULL); + // sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); - sigaction(SIGUSR2,&sa,NULL); + // sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); @@ -614,7 +637,14 @@ void Grid_debug_handler_init(void) sigaction(SIGKILL,&sa,NULL); sigaction(SIGILL,&sa,NULL); - atexit(Grid_exit_handler); + // Non terminating SIGUSR1/2 handler + struct sigaction sa_ping; + sigemptyset (&sa_ping.sa_mask); + sa_ping.sa_sigaction= Grid_usr_signal_handler; + sa_ping.sa_flags = SA_SIGINFO; + sigaction(SIGHUP,&sa_ping,NULL); + + // atexit(Grid_exit_handler); } NAMESPACE_END(Grid); diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs index a202b587..e85dc09e 100644 --- 
a/systems/Aurora/benchmarks/bench1.pbs +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -5,63 +5,34 @@ #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA -#export OMP_PROC_BIND=spread -#unset OMP_PLACES - cd $PBS_O_WORKDIR source ../sourceme.sh -module load pti-gpu -#cat $PBS_NODEFILE +cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 - -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE -#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST - +unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU -# 12 ppn, 2 nodes, 24 ranks -# -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_usqcd --mpi 1.1.1.1 --grid 24.32.32.24 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD | tee usqcd.log - - -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -$CMD | tee 1tile.dwf - CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.32.32.32.48.dwf + ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals" - -CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.64.64.32.96.dwf - -CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.64.32.32.48.dwf +#for f in 1 2 3 4 5 6 7 8 +for f in 1 +do +echo $CMD +$CMD | tee 1node.32.32.64.48.dwf.hbm.$f +done diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index ce477319..4b8eb3fc 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -11,17 +11,16 @@ cd $PBS_O_WORKDIR source ../sourceme.sh -module load pti-gpu +#module load pti-gpu -#cat $PBS_NODEFILE + +cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 - #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST - export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 export 
MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 @@ -34,22 +33,26 @@ export MPICH_OFI_NIC_POLICY=GPU # 12 ppn, 2 nodes, 24 ranks # CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ + ./gpu_tile.sh \ ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD | tee 2node.comms +#$CMD | tee 2node.comms.hbm CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -$CMD | tee 2node.32.32.64.48.dwf + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals" +#for f in 1 2 3 4 5 6 7 8 +for f in 1 +do +echo $CMD +$CMD | tee 2node.32.32.64.48.dwf.hbm.$f +done CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ + ./gpu_tile.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -$CMD | tee 2node.64.64.64.96.dwf + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +#$CMD | tee 2node.64.64.64.96.dwf.hbm diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 8ccba356..7952a819 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,4 +1,6 @@ module load oneapi/release/2023.12.15.001 +#module load mpich/icc-all-debug-pmix-gpu/52.2 +#module load mpich-config/mode/deterministic #module load intel_compute_runtime/release/821.35 source ~/spack/share/spack/setup-env.sh spack load c-lime diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs index 721b4707..1d880f0d 100644 --- a/systems/Aurora/tests/reproBigJob.pbs +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -15,13 +15,13 @@ # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 -#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 @@ -30,20 +30,22 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export GRID_PRINT_ENTIRE_LOG=0 -export GRID_CHECKSUM_RECV_BUF=1 -export GRID_CHECKSUM_SEND_BUF=1 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 export MPICH_OFI_NIC_POLICY=GPU -export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 -export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 -export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling -unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE -unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE -unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE +#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 
+#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE cd $PBS_O_WORKDIR +cp $PBS_NODEFILE nodefile + DIR=reproBigJob.$PBS_JOBID mkdir -p $DIR @@ -51,10 +53,19 @@ cd $DIR cp $PBS_NODEFILE nodefile +BINARY=../Test_dwf_mixedcg_prec + +echo > pingjob < command-line env > environment From 9fa8bd6438960694cfe44a77bf0bfe0352b9d841 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Sep 2024 11:25:44 +0000 Subject: [PATCH 09/50] Configure for AOT on Aurora latest software --- systems/Aurora-AOT/config-command | 23 ++++++++ systems/Aurora-AOT/sourceme.sh | 15 +++++ systems/Aurora-AOT/tests/reproBigJob.pbs | 74 ++++++++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 systems/Aurora-AOT/config-command create mode 100644 systems/Aurora-AOT/sourceme.sh create mode 100644 systems/Aurora-AOT/tests/reproBigJob.pbs diff --git a/systems/Aurora-AOT/config-command b/systems/Aurora-AOT/config-command new file mode 100644 index 00000000..ead5f7c0 --- /dev/null +++ b/systems/Aurora-AOT/config-command @@ -0,0 +1,23 @@ +#Ahead of time compile for PVC +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc " + +#JIT compile +#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " + +../../configure \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-comms=mpi-auto \ + --enable-debug \ + --disable-gparity \ + --disable-fermion-reps \ + --with-lime=$CLIME \ + --enable-shm=nvlink \ + --enable-accelerator=sycl \ + --enable-accelerator-aware-mpi=yes\ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=icpx + diff --git a/systems/Aurora-AOT/sourceme.sh b/systems/Aurora-AOT/sourceme.sh new file mode 100644 index 00000000..4cdc950f --- /dev/null +++ b/systems/Aurora-AOT/sourceme.sh @@ -0,0 +1,15 @@ +#module load oneapi/release/2023.12.15.001 +#module load mpich/icc-all-debug-pmix-gpu/52.2 +#module load mpich-config/mode/deterministic +#module load intel_compute_runtime/release/821.35 + +source ~/spack/share/spack/setup-env.sh +spack load c-lime +spack load openssl +export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +git config --global http.proxy http://proxy.alcf.anl.gov:3128 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/systems/Aurora-AOT/tests/reproBigJob.pbs b/systems/Aurora-AOT/tests/reproBigJob.pbs new file mode 100644 index 00000000..10665df6 --- /dev/null +++ b/systems/Aurora-AOT/tests/reproBigJob.pbs @@ -0,0 +1,74 @@ +#!/bin/bash + +#PBS -l select=512 +#PBS -q EarlyAppAccess +#PBS -A LatticeQCD_aesp_CNDA +#PBS -l walltime=6:00:00 +#PBS -N reproBigJob +#PBS -k doe + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +#module 
load oneapi/eng-compiler/2023.05.15.003 +#module load mpich/51.2/icc-all-deterministic-pmix-gpu + +# 56 cores / 6 threads ~9 +export OMP_NUM_THREADS=6 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 + +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" + +export GRID_PRINT_ENTIRE_LOG=0 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 + +export MPICH_OFI_NIC_POLICY=GPU + +#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + +cd $PBS_O_WORKDIR + +cp $PBS_NODEFILE nodefile + +DIR=reproBigJob.$PBS_JOBID + +mkdir -p $DIR +cd $DIR + +cp $PBS_NODEFILE nodefile + +BINARY=../Test_dwf_mixedcg_prec + +echo > pingjob < command-line +env > environment +$CMD +grep Oops Grid.stderr.* > failures.$PBS_JOBID +rm core.* From 7dcfb13694b36b8f2b9efab27bb2eafaa4dd7d2c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 21:57:35 +0000 Subject: [PATCH 10/50] New software stack --- Grid/algorithms/blas/BatchedBlas.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index f4092bc5..f4245319 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid); typedef cublasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL - typedef cl::sycl::queue *gridblasHandle_t; + typedef sycl::queue *gridblasHandle_t; #endif #ifdef GRID_ONE_MKL - typedef cl::sycl::queue *gridblasHandle_t; + typedef sycl::queue *gridblasHandle_t; #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) typedef int32_t gridblasHandle_t; @@ -89,9 +89,9 @@ public: gridblasHandle = theGridAccelerator; #endif #ifdef GRID_ONE_MKL - cl::sycl::gpu_selector selector; - cl::sycl::device selectedDevice { selector }; - cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; + sycl::gpu_selector selector; + sycl::device selectedDevice { selector }; + sycl::property_list q_prop{sycl::property::queue::in_order()}; gridblasHandle =new sycl::queue (selectedDevice,q_prop); #endif gridblasInit=1; From 295127d45606897d1a97ec7a73b8ade6031c3f26 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 21:58:26 +0000 Subject: [PATCH 11/50] Deterministic homebrew reduction --- Grid/communicator/Communicator_base.h | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index cd682dd0..3f38edd3 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -128,6 +128,34 @@ public: void GlobalXOR(uint32_t &); void GlobalXOR(uint64_t &); + template 
void GlobalSumP2P(obj &o) + { + std::vector column; + obj accum = o; + int source,dest; + for(int d=0;d<_ndimension;d++){ + column.resize(_processors[d]); + column[0] = accum; + std::vector list; + for(int p=1;p<_processors[d];p++){ + ShiftedRanks(d,p,source,dest); + SendToRecvFromBegin(list, + &column[0], + dest, + &column[p], + source, + sizeof(obj),d*100+p); + + } + CommsComplete(list); + for(int p=1;p<_processors[d];p++){ + accum = accum + column[p]; + } + } + Broadcast(0,accum); + o=accum; + } + template void GlobalSum(obj &o){ typedef typename obj::scalar_type scalar_type; int words = sizeof(obj)/sizeof(scalar_type); From 2b5fdcbbc5f4a64647f0bf7c6a1df85d4e2e0372 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 21:59:02 +0000 Subject: [PATCH 12/50] New software version --- Grid/communicator/SharedMemoryMPI.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 2600ce9c..ec6a5003 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -569,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) #ifdef GRID_SYCL_LEVEL_ZERO_IPC typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; - auto zeDevice = cl::sycl::get_native(theGridAccelerator->get_device()); - auto zeContext = cl::sycl::get_native(theGridAccelerator->get_context()); + auto zeDevice = sycl::get_native(theGridAccelerator->get_device()); + auto zeContext = sycl::get_native(theGridAccelerator->get_context()); ze_ipc_mem_handle_t ihandle; clone_mem_t handle; From beb0e474ee6862baa3052d368ce1f9c50ce33827 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 22:01:24 +0000 Subject: [PATCH 13/50] Use deterministic own brand reduction --- Grid/lattice/Lattice_reduction.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 92eb0562..0bd1098e 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -290,8 +290,10 @@ template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); + bool ok; #ifdef GRID_SYCL uint64_t csum=0; + uint64_t csum2=0; if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) { // Hack @@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); uint64_t *base= (uint64_t *)&l_v[0]; csum=svm_xor(base,words); + ok = FlightRecorder::CsumLog(csum); + if ( !ok ) { + csum2=svm_xor(base,words); + std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<GlobalSum(nrm); + ok = FlightRecorder::NormLog(real(nrm)); + if ( !ok ) { + ComplexD nrm2 = rankInnerProduct(left,right); + RealD local2 = real(nrm2); + std::cerr<< " Bad NORM " << local << " recomputed as "<GlobalSumP2P(nrm); + // grid->GlobalSum(nrm); + FlightRecorder::StepLog("Finished global sum"); + // std::cout << " norm "<< nrm << " p2p norm "< Date: Thu, 10 Oct 2024 22:01:57 +0000 Subject: [PATCH 14/50] Better flight logging --- Grid/util/FlightRecorder.cc | 45 +++++++++++++++++++++++++++++++++---- Grid/util/FlightRecorder.h | 7 ++++-- Grid/util/Init.cc | 3 +++ 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 60d18fb6..7bbd4acc 100644 --- 
a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail; int FlightRecorder::LoggingMode; int FlightRecorder::ChecksumComms; int FlightRecorder::ChecksumCommsSend; +const char * FlightRecorder::StepName; +int32_t FlightRecorder::StepLoggingCounter; int32_t FlightRecorder::XmitLoggingCounter; int32_t FlightRecorder::RecvLoggingCounter; int32_t FlightRecorder::CsumLoggingCounter; @@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void) CsumLoggingCounter=0; NormLoggingCounter=0; ReductionLoggingCounter=0; + StepName = "No steps started"; + StepLoggingCounter=0; } void FlightRecorder::Truncate(void) { @@ -88,6 +92,11 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) assert(0); } } +bool FlightRecorder::StepLog(const char *name) +{ + StepName = name; + StepLoggingCounter ++; +} void FlightRecorder::SetLoggingModePrint(void) { @@ -111,17 +120,19 @@ uint64_t FlightRecorder::ErrorCount(void) { return ErrorCounter; } -void FlightRecorder::NormLog(double value) +bool FlightRecorder::NormLog(double value) { uint64_t hex = * ( (uint64_t *)&value ); if(LoggingMode == LoggingModePrint) { std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<si_signo); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," code %d\n",si->si_code); From 68f112d576a53c404f55ea34cbad46ce8efbab57 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 22:03:04 +0000 Subject: [PATCH 15/50] New software moves cl::sycl --- Grid/lattice/Lattice_reduction_sycl.h | 16 ++++++++-------- Grid/lattice/Lattice_slicesum_core.h | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index 3718e6ea..bc9257b9 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os Integer nsimd= vobj::Nsimd(); { sycl::buffer abuff(&ret, {1}); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>()); - cgh.parallel_for(cl::sycl::range<1>{osites}, + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>()); + cgh.parallel_for(sycl::range<1>{osites}, Reduction, - [=] (cl::sycl::id<1> item, auto &sum) { + [=] (sycl::id<1> item, auto &sum) { auto osite = item[0]; sum +=Reduce(lat[osite]); }); @@ -75,11 +75,11 @@ template Word svm_xor(Word *vec,uint64_t L) Word ret = 0; { sycl::buffer abuff(&ret, {1}); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); - cgh.parallel_for(cl::sycl::range<1>{L}, + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); + cgh.parallel_for(sycl::range<1>{L}, Reduction, - [=] (cl::sycl::id<1> index, auto &sum) { + [=] (sycl::id<1> index, auto &sum) { sum ^=vec[index]; }); }); diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index f01ba73d..e15055a6 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data, }); for (int r = 0; r < rd; r++) { - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = 
cl::sycl::reduction(&mysum[r],std::plus<>()); - cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + theGridAccelerator->submit([&](sycl::handler &cgh) { + auto Reduction = sycl::reduction(&mysum[r],std::plus<>()); + cgh.parallel_for(sycl::range<1>{subvol_size}, Reduction, - [=](cl::sycl::id<1> item, auto &sum) { + [=](sycl::id<1> item, auto &sum) { auto s = item[0]; sum += rb_p[r*subvol_size+s]; }); From be7a543e2c6f9e19707217d62b02052e4ded4fcf Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 22:03:29 +0000 Subject: [PATCH 16/50] Revert barriers -- these were not the problem --- Grid/stencil/Stencil.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1db03813..7ace084c 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -364,9 +364,10 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + FlightRecorder::StepLog("Communicate begin"); // All GPU kernel tasks must complete - accelerator_barrier(); // All kernels should ALREADY be complete - _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + // accelerator_barrier(); // All kernels should ALREADY be complete + // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, @@ -386,18 +387,20 @@ public: void CommunicateComplete(std::vector > &reqs) { + FlightRecorder::StepLog("Start communicate complete"); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete - accelerator_barrier(); + // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete + // accelerator_barrier(); _grid->StencilBarrier(); // run any checksums for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { - accelerator_barrier(); + // accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -487,7 +490,7 @@ public: HaloGatherDir(source,compress,point,face_idx); } accelerator_barrier(); // All my local gathers are complete - _grid->StencilBarrier();// Synch shared memory on a single nodes + // _grid->StencilBarrier();// Synch shared memory on a single nodes face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); } From c5c67b706ecfd06879d678bde5f8b66be93ab28c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Oct 2024 22:04:12 +0000 Subject: [PATCH 17/50] cl::sycl -> SYCL --- Grid/threads/Accelerator.cc | 14 +++++++------- Grid/threads/Accelerator.h | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index fa11dd5f..74d1f585 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -202,13 +202,13 @@ void acceleratorInit(void) #ifdef GRID_SYCL -cl::sycl::queue *theGridAccelerator; -cl::sycl::queue *theCopyAccelerator; +sycl::queue *theGridAccelerator; +sycl::queue *theCopyAccelerator; void acceleratorInit(void) { int nDevices = 1; - // cl::sycl::gpu_selector selector; - // cl::sycl::device selectedDevice { selector }; + // sycl::gpu_selector selector; + // sycl::device 
selectedDevice { selector }; theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. @@ -242,14 +242,14 @@ void acceleratorInit(void) gethostname(hostname, HOST_NAME_MAX+1); if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); - auto devices = cl::sycl::device::get_devices(); + auto devices = sycl::device::get_devices(); for(int d = 0;d().c_str()); + printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info().c_str()); #define GPU_PROP_FMT(prop,FMT) \ - printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); + printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); if ( world_rank == 0) { diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1cb56ddd..e37b5fb7 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -302,7 +302,7 @@ NAMESPACE_END(Grid); // Force deterministic reductions #define SYCL_REDUCTION_DETERMINISTIC -#include +#include #include #include #include @@ -314,8 +314,8 @@ inline void acceleratorMem(void) std::cout <<" SYCL acceleratorMem not implemented"<>()[2]; + return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2]; #else return 0; #endif } // SYCL specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ + theGridAccelerator->submit([&](sycl::handler &cgh) { \ unsigned long nt=acceleratorThreads(); \ if(nt < 8)nt=8; \ unsigned long unum1 = num1; \ unsigned long unum2 = num2; \ unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ - cl::sycl::range<3> local {nt,1,nsimd}; \ - cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ + sycl::range<3> local {nt,1,nsimd}; \ + sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + sycl::nd_range<3>(global,local), \ + [=] (sycl::nd_item<3> item) /*mutable*/ \ [[intel::reqd_sub_group_size(16)]] \ { \ auto iter1 = item.get_global_id(0); \ @@ -369,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele inline int acceleratorIsCommunicable(void *ptr) { #if 0 - auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); - if ( uvm = cl::sycl::usm::alloc::shared ) return 1; + auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); + if ( uvm = sycl::usm::alloc::shared ) return 1; else return 0; #endif return 1; From fd58f0b669df84f9a77124e650b40d3e9ca1f5b2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 11 Oct 2024 03:21:21 +0000 Subject: [PATCH 18/50] Return ok --- Grid/util/FlightRecorder.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index 7bbd4acc..c19d3dbb 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -96,6 +96,7 @@ bool FlightRecorder::StepLog(const char *name) { StepName = name; StepLoggingCounter ++; + return true; } void FlightRecorder::SetLoggingModePrint(void) From 54f19990308e284d007f901e6d926ee87abb45cc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 11 Oct 2024 03:22:18 +0000 Subject: [PATCH 19/50] axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues --- 
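For background on the change below: a run-to-run reproducible global sum can
be built from point-to-point messages, so that partials are always accumulated
in a fixed rank order independent of message arrival. A minimal sketch in
MPI -- GlobalSumDeterministic is a hypothetical stand-in, not Grid's actual
GlobalSumP2P implementation:

#include <mpi.h>

// Rank 0 receives partials in rank order, accumulates left-to-right, then
// broadcasts; the summation order never depends on network timing.
static void GlobalSumDeterministic(double &d, MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  if (rank == 0) {
    for (int r = 1; r < size; r++) {
      double partial;
      MPI_Recv(&partial, 1, MPI_DOUBLE, r, 0, comm, MPI_STATUS_IGNORE);
      d += partial; // fixed order: r = 1, 2, ..., size-1
    }
  } else {
    MPI_Send(&d, 1, MPI_DOUBLE, 0, 0, comm);
  }
  MPI_Bcast(&d, 1, MPI_DOUBLE, 0, comm);
}

MPI_Allreduce, by contrast, is free to reassociate the floating-point
additions, so repeated runs or different process placements can differ in the
low-order bits -- exactly what the flight-recorder norm checks in this series
are designed to flag.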
Grid/lattice/Lattice_reduction.h | 38 +++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 0bd1098e..dffcae92 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -375,8 +375,44 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(z_v[ss],tmp); }); + bool ok; + uint64_t csum=0; + uint64_t csum2=0; +#ifdef GRID_SYCL + if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) + { + // z_v + { + Integer words = sites*sizeof(vobj)/sizeof(uint64_t); + uint64_t *base= (uint64_t *)&z_v[0]; + csum=svm_xor(base,words); + ok = FlightRecorder::CsumLog(csum); + if ( !ok ) { + csum2=svm_xor(base,words); + std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<GlobalSum(nrm); + ok = FlightRecorder::NormLog(real(nrm)); + assert(ok); + RealD local = real(nrm); + grid->GlobalSumP2P(nrm); + FlightRecorder::ReductionLog(local,real(nrm)); return nrm; } From b728af903c4add9f92ad970b561f86ca3156752a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 11 Oct 2024 03:23:09 +0000 Subject: [PATCH 20/50] Fast axpy norm under CFLAG --- Grid/lattice/Lattice_arith.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 5b37532f..f40d23da 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -257,17 +257,30 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice }); } +#define FAST_AXPY_NORM template inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y) { GRID_TRACE("axpy_norm"); - return axpy_norm_fast(ret,a,x,y); +#ifdef FAST_AXPY_NORM + return axpy_norm_fast(ret,a,x,y); +#else + ret = a*x+y; + RealD nn=norm2(ret); + return nn; +#endif } template inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y) { GRID_TRACE("axpby_norm"); - return axpby_norm_fast(ret,a,b,x,y); +#ifdef FAST_AXPY_NORM + return axpby_norm_fast(ret,a,b,x,y); +#else + ret = a*x+b*y; + RealD nn=norm2(ret); + return nn; +#endif } /// Trace product From 5ec879860a9c4d4d9ec7aa4e3e4871f55887667e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Oct 2024 14:30:54 +0000 Subject: [PATCH 21/50] Odd rounding issue - bears looking into --- Grid/algorithms/iterative/ConjugateGradientMixedPrec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index 27fee791..c434b9ef 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid); //Compute double precision rsd and also new RHS vector. 
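// Defect-correction step in outline (a sketch of the logic, assuming the
// usual reliable-update scheme): the single-precision inner solve leaves
// rounding error in sol_d, so the true residual is rebuilt in double,
//   tmp_d = A sol_d ;  src_d = src_d_in - tmp_d ;  norm = |src_d|^2
// and src_d becomes the right-hand side of the next inner solve. The fused
// axpy_norm below performs exactly this update, which is why its global
// reduction must be reproducible when comparing runs.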
Linop_d.HermOp(sol_d, tmp_d); RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector - + std::cout< Date: Tue, 15 Oct 2024 14:32:11 +0000 Subject: [PATCH 22/50] Use normal reduction, configure flag to force deterministic --- Grid/lattice/Lattice_reduction.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index dffcae92..c7da2945 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ assert(ok); } FlightRecorder::StepLog("Start global sum"); - grid->GlobalSumP2P(nrm); - // grid->GlobalSum(nrm); + // grid->GlobalSumP2P(nrm); + grid->GlobalSum(nrm); FlightRecorder::StepLog("Finished global sum"); // std::cout << " norm "<< nrm << " p2p norm "< &z,sobj a,sobj b,const Lattice &x,const Latt ok = FlightRecorder::NormLog(real(nrm)); assert(ok); RealD local = real(nrm); - grid->GlobalSumP2P(nrm); + grid->GlobalSum(nrm); FlightRecorder::ReductionLog(local,real(nrm)); return nrm; } From febfe4e77f885dc0ab26e2b6200ebdf6b24a0367 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Oct 2024 14:32:35 +0000 Subject: [PATCH 23/50] Make my own reduction a configure flag --- Grid/communicator/Communicator_base.cc | 19 +++++++++++++---- Grid/communicator/Communicator_mpi3.cc | 28 +++++++++++++++++--------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc index 79efb90c..f9a4c442 100644 --- a/Grid/communicator/Communicator_base.cc +++ b/Grid/communicator/Communicator_base.cc @@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return // very VERY rarely (Log, serial RNG) we need world without a grid //////////////////////////////////////////////////////////////////////////////// +#ifdef USE_GRID_REDUCTION +void CartesianCommunicator::GlobalSum(ComplexF &c) +{ + GlobalSumP2P(c); +} +void CartesianCommunicator::GlobalSum(ComplexD &c) +{ + GlobalSumP2P(c); +} +#else void CartesianCommunicator::GlobalSum(ComplexF &c) { GlobalSumVector((float *)&c,2); } -void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) -{ - GlobalSumVector((float *)c,2*N); -} void CartesianCommunicator::GlobalSum(ComplexD &c) { GlobalSumVector((double *)&c,2); } +#endif +void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) +{ + GlobalSumVector((float *)c,2*N); +} void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) { GlobalSumVector((double *)c,2*N); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 5fa70da4..192bb339 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator() } } } +#ifdef USE_GRID_REDUCTION +void CartesianCommunicator::GlobalSum(float &f){ + CartesianCommunicator::GlobalSumP2P(f); +} +void CartesianCommunicator::GlobalSum(double &d) +{ + CartesianCommunicator::GlobalSumP2P(d); +} +#else +void CartesianCommunicator::GlobalSum(float &f){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); + assert(ierr==0); +} +#endif void CartesianCommunicator::GlobalSum(uint32_t &u){ int 
ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); assert(ierr==0); @@ -287,20 +306,11 @@ void CartesianCommunicator::GlobalMax(double &d) int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); assert(ierr==0); } -void CartesianCommunicator::GlobalSum(float &f){ - int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); - assert(ierr==0); -} void CartesianCommunicator::GlobalSumVector(float *f,int N) { int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); } -void CartesianCommunicator::GlobalSum(double &d) -{ - int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); - assert(ierr==0); -} void CartesianCommunicator::GlobalSumVector(double *d,int N) { int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); From 03687c1d624aef21813db68ea6256ae88049be99 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Oct 2024 14:35:17 +0000 Subject: [PATCH 24/50] Final version of test, closer to original again --- tests/Test_dwf_mixedcg_prec.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index f37696a8..97bf5143 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -124,6 +124,8 @@ int main (int argc, char ** argv) SchurDiagMooeeOperatorParanoid HermOpEO(Ddwf); SchurDiagMooeeOperatorParanoid HermOpEO_f(Ddwf_f); + // SchurDiagMooeeOperator HermOpEO(Ddwf); + // SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); int nsecs=600; if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ @@ -131,6 +133,10 @@ int main (int argc, char ** argv) GridCmdOptionInt(arg,nsecs); } + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl; + UGrid->Barrier(); + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl; + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "< mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); @@ -148,7 +154,7 @@ int main (int argc, char ** argv) FlightRecorder::ContinueOnFail = 0; FlightRecorder::PrintEntireLog = 0; - FlightRecorder::ChecksumComms = 1; + FlightRecorder::ChecksumComms = 0; FlightRecorder::ChecksumCommsSend=0; if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); @@ -180,7 +186,7 @@ int main (int argc, char ** argv) iter ++; now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); } while (now < (start + nsecs/10) ); - + std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; ConjugateGradient CG(1.0e-8,10000); int i=0; From 2eff3f34edbbef34f12bf8361b9e0075443b28ba Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Oct 2024 14:36:06 +0000 Subject: [PATCH 25/50] Alternate reduction; default to grids own but make a configure flag --enable-reduction=grid|mpi --- configure.ac | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/configure.ac b/configure.ac index 652944f9..776ca264 100644 --- a/configure.ac +++ b/configure.ac @@ -128,6 +128,20 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; esac +############### internal reduction +AC_ARG_ENABLE([reduction], + [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])], + [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid]) + +case ${ac_REDUCTION} in + mpi) + ;; + grid) + AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);; + *) + AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID 
REDUCTION]);; +esac + ############### tracing AC_ARG_ENABLE([tracing], [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])], From a78a61d76f732ea7a804e527f17e75504655ba92 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Oct 2024 14:38:45 +0000 Subject: [PATCH 26/50] Update configure --- systems/Aurora/config-command | 10 ++++++++-- systems/Aurora/sourceme.sh | 3 ++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 5b4e378c..423da64e 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,8 +1,14 @@ +#Ahead of time compile for PVC +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " + +#JIT compile +#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " +#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions " ../../configure \ --enable-simd=GPU \ + --enable-reduction=grid \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ --enable-debug \ diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7952a819..4cdc950f 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,7 +1,8 @@ -module load oneapi/release/2023.12.15.001 +#module load oneapi/release/2023.12.15.001 #module load mpich/icc-all-debug-pmix-gpu/52.2 #module load mpich-config/mode/deterministic #module load intel_compute_runtime/release/821.35 + source ~/spack/share/spack/setup-env.sh spack load c-lime spack load openssl From 6815e138b41b9832167b6eb05759a6b145d3db1a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 17 Oct 2024 18:36:32 +0100 Subject: [PATCH 27/50] Boosted fermion attempt --- .../fermion/ContinuedFractionFermion5D.h | 44 ++++ .../action/fermion/PartialFractionFermion5D.h | 53 ++++- ...ContinuedFractionFermion5DImplementation.h | 16 +- .../PartialFractionFermion5DImplementation.h | 86 +++++++- tests/qdpxx/Test_qdpxx_munprec.cc | 188 +++++++++++++++--- 5 files changed, 344 insertions(+), 43 deletions(-) diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 2300afd3..27931f91 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -60,6 +60,50 @@ public: // virtual void Instantiatable(void)=0; virtual void Instantiatable(void) =0; + void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector boundary, std::vector twist) + { + std::cout << "Free Propagator for PartialFraction"<_fdimensions[nu+shift]))); + //momenta for propagator shifted by twist+boundary + twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI)); + } + in_buf = exp(ci*ph*(-1.0))*in; + + theFFT.FFT_all_dim(in_k,in,FFT::forward); + 
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist); + theFFT.FFT_all_dim(out,prop_k,FFT::backward); + + //phase for boundary condition + out = out * exp(ci*ph); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + std::vector boundary; + for(int i=0;i &out); diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index 54f8547f..34aa80b8 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D public: INHERIT_IMPL_TYPES(Impl); - const int part_frac_chroma_convention=1; + const int part_frac_chroma_convention=0; void Meooe_internal(const FermionField &in, FermionField &out,int dag); void Mooee_internal(const FermionField &in, FermionField &out,int dag); @@ -83,12 +83,63 @@ public: GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD M5,const ImplParams &p= ImplParams()); + PartialFractionFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass,RealD M5,std::vector &_qmu,const ImplParams &p= ImplParams()); + + void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector boundary, std::vector twist) + { + std::cout << "Free Propagator for PartialFraction"<_fdimensions[nu+shift]))); + //momenta for propagator shifted by twist+boundary + twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI)); + } + in_buf = exp(ci*ph*(-1.0))*in; + + theFFT.FFT_all_dim(in_k,in,FFT::forward); + this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist); + theFFT.FFT_all_dim(out,prop_k,FFT::backward); + + //phase for boundary condition + out = out * exp(ci*ph); + }; + + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { + std::vector twist(Nd,0.0); //default: periodic boundarys in all directions + std::vector boundary; + for(int i=0;i qmu; RealD mass; RealD dw_diag; RealD R; diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index 6687800e..4bfbd31e 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -42,13 +42,13 @@ template void ContinuedFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata) { // How to check Ls matches?? 
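// (zdata carries the Zolotarev rational-approximation data; the
// continued-fraction representation consumes one beta coefficient per
// s-slice, so zdata->db must match Ls -- the assert a few lines below,
// "Beta has Ls coeffs", enforces this.)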
- // std::cout<n << " - n"<da << " -da "<db << " -db"<dn << " -dn"<dd << " -dd"<n << " - n"<da << " -da "<db << " -db"<dn << " -dn"<dd << " -dd"<Ls; + std::cout<db==Ls);// Beta has Ls coeffs R=(1+this->mass)/(1-this->mass); @@ -320,7 +320,7 @@ ContinuedFractionFermion5D::ContinuedFractionFermion5D( int Ls = this->Ls; conformable(solution5d.Grid(),this->FermionGrid()); conformable(exported4d.Grid(),this->GaugeGrid()); - ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); + ExtractSlice(exported4d, solution5d, Ls-1, 0); } template void ContinuedFractionFermion5D::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) @@ -330,7 +330,7 @@ ContinuedFractionFermion5D::ContinuedFractionFermion5D( conformable(input4d.Grid() ,this->GaugeGrid()); FermionField tmp(this->FermionGrid()); tmp=Zero(); - InsertSlice(input4d, tmp, Ls-1, Ls-1); + InsertSlice(input4d, tmp, Ls-1, 0); tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; this->Dminus(tmp,imported5d); } diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index 0206828b..cbbda785 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -255,15 +255,76 @@ void PartialFractionFermion5D::M_internal(const FermionField &psi, Fermi } { + // The 'conventional' Cayley overlap operator is + // + // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw + // + // + // With massless limit 1/2(1+g5 sgnHw) + // + // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2) + // + // However, the conventional normalisation has both a leading order factor of 2 in Zq + // at tree level AND a mass dependent (1-m) that are convenient to absorb. 
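// Spelling the rescaling out as a worked check:
//   2/(1-m) * [ (1+m)/2 + (1-m)/2 g5 sgn Hw ] = (1+m)/(1-m) + g5 sgn Hw
// so the 4d partial-fraction operator only needs the diagonal factor
//   R = (1+m)/(1-m) = 1 + 2m/(1-m),
// which is the R*scale coefficient handed to ag5xpbg5y_ssp further down
// in this hunk.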
+ // + // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is + // + // num = -i sin kmu gmu + // + // denom ( sqrt(sk^2 + (2shk^2 - 1)^2 + // b_k = sk2 - M5; + // + // w_k = sqrt(sk + b_k*b_k); + // + // denom= ( w_k + b_k + mass*mass) ; + // + // denom= one/denom; + // out = num*denom; + // + // Chroma, and Grid define partial fraction via 4d operator + // + // Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw + // + // Now since: + // + // (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m) + // + // This corresponds to a modified mass parameter + // + // It has an annoying + // + // double R=(1+this->mass)/(1-this->mass); //R g5 psi[Ls] + p[0] H ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1); - + for(int b=0;b::SetCoefficientsZolotarev(RealD zolo_hi,App int Ls = this->Ls; conformable(solution5d.Grid(),this->FermionGrid()); conformable(exported4d.Grid(),this->GaugeGrid()); - ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); + ExtractSlice(exported4d, solution5d, Ls-1, 0); } template void PartialFractionFermion5D::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) @@ -421,7 +482,8 @@ void PartialFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,App conformable(input4d.Grid() ,this->GaugeGrid()); FermionField tmp(this->FermionGrid()); tmp=Zero(); - InsertSlice(input4d, tmp, Ls-1, Ls-1); + std::cout << " importing to slice " << Ls-1 <Dminus(tmp,imported5d); } @@ -442,7 +504,7 @@ PartialFractionFermion5D::PartialFractionFermion5D(GaugeField &_Umu, { int Ls = this->Ls; - + qmu.resize(0); assert((Ls&0x1)==1); // Odd Ls required int nrational=Ls-1; @@ -460,6 +522,22 @@ PartialFractionFermion5D::PartialFractionFermion5D(GaugeField &_Umu, Approx::zolotarev_free(zdata); } +template +PartialFractionFermion5D::PartialFractionFermion5D(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass,RealD M5, + std::vector &_qmu, + const ImplParams &p) + : PartialFractionFermion5D(_Umu, + FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,M5,p) +{ + qmu=_qmu; +} NAMESPACE_END(Grid); diff --git a/tests/qdpxx/Test_qdpxx_munprec.cc b/tests/qdpxx/Test_qdpxx_munprec.cc index 82874546..c6ce2800 100644 --- a/tests/qdpxx/Test_qdpxx_munprec.cc +++ b/tests/qdpxx/Test_qdpxx_munprec.cc @@ -1,7 +1,6 @@ /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid - Source file: ./tests/qdpxx/Test_qdpxx_munprec.cc Copyright (C) 2015 @@ -26,13 +25,17 @@ Author: paboyle See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ +#include +#include +#include + #include int Ls=8; double M5=1.6; double mq=0.01; -double zolo_lo = 0.1; -double zolo_hi = 2.0; +double zolo_lo = 0.01; +double zolo_hi = 7.0; double mobius_scale=2.0; enum ChromaAction { @@ -55,11 +58,6 @@ enum ChromaAction { void calc_grid (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag); void calc_chroma (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag); -#include -#include -#include - - namespace Chroma { @@ -81,7 +79,7 @@ public: std::vector x(4); QDP::multi1d cx(4); - std::vector gd= gr.Grid()->GlobalDimensions(); + 
Grid::Coordinate gd = gr.Grid()->GlobalDimensions(); for (x[0]=0;x[0] x(5); QDP::multi1d cx(4); - std::vector gd= gr.Grid()->GlobalDimensions(); + Grid::Coordinate gd= gr.Grid()->GlobalDimensions(); for (x[0]=0;x[0] x(5); QDP::multi1d cx(4); - std::vector gd= gr.Grid()->GlobalDimensions(); + Grid::Coordinate gd= gr.Grid()->GlobalDimensions(); for (x[0]=0;x[0]OVEXT_CONSTANT_STRATEGY\n"; +"OVEXT_CONSTANT_STRATEGY1.0\n"; + UnprecOvExtFermActArray S_f(cfs,param); + Handle< FermState > fs( S_f.createState(u) ); + Handle< LinearOperatorArray > M(S_f.linOp(fs)); + return M; + } + if ( parms == HwPartFracTanh ) { + if ( Ls%2 == 0 ) { + printf("Ls is not odd\n"); + exit(-1); + } + UnprecOvExtFermActArrayParams param; + param.OverMass=M5; + param.Mass=_mq; + param.RatPolyDeg = Ls; + param.ApproxMin =eps_lo; + param.ApproxMax =eps_hi; + param.b5 =1.0; + param.c5 =1.0; + // param.approximation_type=COEFF_TYPE_ZOLOTAREV; + param.approximation_type=COEFF_TYPE_TANH_UNSCALED; + //param.approximation_type=COEFF_TYPE_TANH; + param.tuning_strategy_xml= + "OVEXT_CONSTANT_STRATEGY1.0\n"; UnprecOvExtFermActArray S_f(cfs,param); Handle< FermState > fs( S_f.createState(u) ); Handle< LinearOperatorArray > M(S_f.linOp(fs)); @@ -316,7 +337,35 @@ public: param.ApproxMin=eps_lo; param.ApproxMax=eps_hi; param.approximation_type=COEFF_TYPE_ZOLOTAREV; - param.RatPolyDeg=Ls; + param.RatPolyDeg=Ls-1; + // The following is why I think Chroma made some directional errors: + param.AuxFermAct= std::string( +"\n" +" UNPRECONDITIONED_WILSON\n" +" -1.8\n" +" 1\n" +" 0\n" +" 1000\n" +" 1.0e-9\n" +" \n" +" SIMPLE_FERMBC\n" +" 1 1 1 1\n" +" \n" +"" +); + param.AuxFermActGrp= std::string(""); + UnprecOvlapContFrac5DFermActArray S_f(fbc,param); + Handle< FermState > fs( S_f.createState(u) ); + Handle< LinearOperatorArray > M(S_f.linOp(fs)); + return M; + } + if ( parms == HwContFracTanh ) { + UnprecOvlapContFrac5DFermActParams param; + param.Mass=_mq; // How is M5 set? Wilson mass In AuxFermAct + param.ApproxMin=eps_lo; + param.ApproxMax=eps_hi; + param.approximation_type=COEFF_TYPE_TANH_UNSCALED; + param.RatPolyDeg=Ls-1; // The following is why I think Chroma made some directional errors: param.AuxFermAct= std::string( "\n" @@ -378,7 +427,14 @@ int main (int argc,char **argv ) * Setup QDP *********************************************************/ Chroma::initialize(&argc,&argv); - Chroma::WilsonTypeFermActs4DEnv::registerAll(); + // Chroma::WilsonTypeFermActs4DEnv::registerAll(); + Chroma::WilsonTypeFermActsEnv::registerAll(); + //bool linkageHack(void) + //{ + // bool foo = true; + // Inline Measurements + // InlineAggregateEnv::registerAll(); + // GaugeInitEnv::registerAll(); /******************************************************** * Setup Grid @@ -388,26 +444,34 @@ int main (int argc,char **argv ) Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()), Grid::GridDefaultMpi()); - std::vector gd = UGrid->GlobalDimensions(); + Grid::Coordinate gd = UGrid->GlobalDimensions(); QDP::multi1d nrow(QDP::Nd); for(int mu=0;mu<4;mu++) nrow[mu] = gd[mu]; QDP::Layout::setLattSize(nrow); QDP::Layout::create(); - Grid::GridCartesian * FGrid = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); - Grid::LatticeGaugeField lat(UGrid); - Grid::LatticeFermion src(FGrid); - Grid::LatticeFermion res_chroma(FGrid); - Grid::LatticeFermion res_grid (FGrid); - std::vector ActionList({ HtCayleyTanh, // Plain old DWF. HmCayleyTanh, HwCayleyTanh, HtCayleyZolo, // Plain old DWF. 
HmCayleyZolo, - HwCayleyZolo + HwCayleyZolo, + HwPartFracZolo, + HwContFracZolo, + HwContFracTanh + }); + std::vector LsList({ + 8,//HtCayleyTanh, // Plain old DWF. + 8,//HmCayleyTanh, + 8,//HwCayleyTanh, + 8,//HtCayleyZolo, // Plain old DWF. + 8,//HmCayleyZolo, + 8,//HwCayleyZolo, + 9,//HwPartFracZolo + 9, //HwContFracZolo + 9 //HwContFracTanh }); std::vector ActionName({ "HtCayleyTanh", @@ -415,10 +479,19 @@ int main (int argc,char **argv ) "HwCayleyTanh", "HtCayleyZolo", "HmCayleyZolo", - "HwCayleyZolo" + "HwCayleyZolo", + "HwPartFracZolo", + "HwContFracZolo", + "HwContFracTanh" }); for(int i=0;i::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); /* Grid::LatticeColourMatrix U(UGrid); @@ -519,7 +593,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF if ( action == HtCayleyTanh ) { - Grid::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5); + Grid::DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5); std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "< Date: Fri, 18 Oct 2024 15:42:30 +0000 Subject: [PATCH 28/50] Meson field test --- Grid/lattice/Lattice_reduction.h | 6 + Grid/qcd/utils/A2Autils.h | 201 ++++++++++++++++++++++++------- tests/Test_meson_field.cc | 60 +++++---- 3 files changed, 203 insertions(+), 64 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index c7da2945..837e3bea 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -522,11 +522,14 @@ template inline void sliceSum(const Lattice &Data, int ostride=grid->_ostride[orthogdim]; //Reduce Data down to lvSum + RealD t_sum =-usecond(); sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); + t_sum +=usecond(); // Sum across simd lanes in the plane, breaking out orthog dir. Coordinate icoor(Nd); + RealD t_rest =-usecond(); for(int rt=0;rt inline void sliceSum(const Lattice &Data, scalar_type * ptr = (scalar_type *) &result[0]; int words = fd*sizeof(sobj)/sizeof(scalar_type); grid->GlobalSumVector(ptr, words); + t_rest +=usecond(); + std::cout << GridLogMessage << " sliceSum local"< inline std::vector diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index a81ebe6c..70eaf0ab 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -6,6 +6,34 @@ NAMESPACE_BEGIN(Grid); #undef DELTA_F_EQ_2 +/////////////////////////////////////////////////////////////////// +//Meson +// Interested in +// +// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] +// +// Conventional meson field: +// +// = sum_x,y Trace[ sum_j G |v_j(y,ty)> +// = sum_ij PI_ji(tx) PI_ij(ty) +// +// G5-Hermiticity +// +// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] +// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ] +// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> +// = sum_ij PionVV(ty) PionWW(tx) +// +// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker +// expectation value. Otherwise biased. +//////////////////////////////////////////////////////////////////// + template class A2Autils { @@ -26,7 +54,9 @@ public: typedef iSpinColourMatrix SpinColourMatrix_v; - template // output: rank 5 tensor, e.g. Eigen::Tensor + + // output: rank 5 tensor, e.g. 
Eigen::Tensor + template static void MesonField(TensorType &mat, const FermionField *lhs_wi, const FermionField *rhs_vj, @@ -34,6 +64,14 @@ public: const std::vector &mom, int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); + template + static void MesonFieldGPU(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); + /* static void PionFieldWVmom(Eigen::Tensor &mat, const FermionField *wi, const FermionField *vj, @@ -58,7 +96,8 @@ public: const FermionField *vi, const FermionField *vj, int orthogdim); - + */ + template // output: rank 5 tensor, e.g. Eigen::Tensor static void AslashField(TensorType &mat, const FermionField *lhs_wi, @@ -159,14 +198,14 @@ void A2Autils::MesonField(TensorType &mat, int MFlvol = ld*Lblock*Rblock*Nmom; std::vector lvSum(MFrvol); - thread_for( r, MFrvol,{ + for(int r=0;r lsSum(MFlvol); - thread_for(r,MFlvol,{ + for(int r=0;r_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; @@ -174,7 +213,7 @@ void A2Autils::MesonField(TensorType &mat, // potentially wasting cores here if local time extent too small if (t_kernel) *t_kernel = -usecond(); - thread_for(r,rd,{ + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -213,10 +252,10 @@ void A2Autils::MesonField(TensorType &mat, } } } - }); + }; // Sum across simd lanes in the plane, breaking out orthog dir. - thread_for(rt,rd,{ + for(int rt=0;rt extracted(Nsimd); @@ -241,7 +280,7 @@ void A2Autils::MesonField(TensorType &mat, } }}} - }); + } if (t_kernel) *t_kernel += usecond(); assert(mat.dimension(0) == Nmom); assert(mat.dimension(1) == Ngamma); @@ -290,35 +329,115 @@ void A2Autils::MesonField(TensorType &mat, if (t_gsum) *t_gsum += usecond(); } +const int A2Ablocking=8; +template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; +typedef iVecSpinMatrix VecSpinMatrix; +typedef iVecSpinMatrix vVecSpinMatrix; +typedef Lattice LatticeVecSpinMatrix; -/////////////////////////////////////////////////////////////////// -//Meson -// Interested in -// -// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] -// -// Conventional meson field: -// -// = sum_x,y Trace[ sum_j G |v_j(y,ty)> -// = sum_ij PI_ji(tx) PI_ij(ty) -// -// G5-Hermiticity -// -// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ] -// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ] -// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> -// = sum_ij PionVV(ty) PionWW(tx) -// -// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker -// expectation value. Otherwise biased. 
-//////////////////////////////////////////////////////////////////// +template +template +void A2Autils::MesonFieldGPU(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecSpinMatrix MomSpinMat(grid); + + RealD t_afor = 0.0; + RealD t_sum = 0.0; + RealD t_pha = 0.0; + RealD t_trace= 0.0; + uint64_t ncall=0; + + std::vector sliced; + for(int i=0;i void A2Autils::PionFieldXX(Eigen::Tensor &mat, const FermionField *wi, @@ -645,6 +764,7 @@ void A2Autils::PionFieldVV(Eigen::Tensor &mat, const int nog5=0; PionFieldXX(mat,vi,vj,orthogdim,nog5); } +*/ // "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) // @@ -992,9 +1112,9 @@ typename std::enable_if<(std::is_same, TensorType>::va std::is_same>, TensorType>::value), void>::type A2Autils::ContractWWVV(std::vector &WWVV, - const TensorType &WW_sd, - const FermionField *vs, - const FermionField *vd) + const TensorType &WW_sd, + const FermionField *vs, + const FermionField *vd) { GridBase *grid = vs[0].Grid(); @@ -1062,7 +1182,6 @@ A2Autils::ContractWWVV(std::vector &WWVV, } for (int t = 0; t < N_t; t++){ - std::cout << GridLogMessage << "Contraction t = " << t << std::endl; buf = WW_sd[t]; thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o phi(VDIM,&grid); - std::vector rho(VDIM,&grid); - FermionField rho_tmp(&grid); std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; for (unsigned int i = 0; i < VDIM; ++i){ random(pRNG,phi[i]); - random(pRNG,rho_tmp); //ideally only nonzero on t=0 - rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0 } std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; @@ -82,7 +78,7 @@ int main(int argc, char *argv[]) {1.,1.,1.}, {2.,0.,0.} }; - + // 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl; std::cout << GridLogMessage << "Computing complex phases" << std::endl; @@ -102,28 +98,47 @@ int main(int argc, char *argv[]) std::cout << GridLogMessage << "Computing complex phases done." 
<< std::endl; Eigen::Tensor Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); - Eigen::Tensor Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); - Eigen::Tensor Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + Eigen::Tensor Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); // timer double start,stop; //execute meson field routine + std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl; + A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); + std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl; start = usecond(); A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); stop = usecond(); std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl; - start = usecond(); - /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices) - * to the routine so that all the compnents which are predictably equal to zero are not computed. */ - A2Autils::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp); - stop = usecond(); - std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl; - start = usecond(); - A2Autils::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp); - stop = usecond(); - std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl; + std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl; + A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp); + std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl; + start = usecond(); + A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp); + stop = usecond(); + std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl; + + for(int mom=0;mom Date: Fri, 18 Oct 2024 13:56:24 -0400 Subject: [PATCH 29/50] Remove the copying version --- Grid/cshift/Cshift_mpi.h | 259 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 246 insertions(+), 13 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 3a4e0361..90052051 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -31,7 +31,7 @@ Author: paboyle NAMESPACE_BEGIN(Grid); - +const int Cshift_verbose=0; template Lattice Cshift(const Lattice &rhs,int dimension,int shift) { typedef typename vobj::vector_type vector_type; @@ -65,10 +65,10 @@ template Lattice Cshift(const Lattice &rhs,int dimension Cshift_comms(ret,rhs,dimension,shift); } t1=usecond(); - // std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"< void Cshift_comms(Lattice& ret,const Lattice &rhs,int dimension,int shift) { int sshift[2]; @@ -175,11 +175,13 @@ template void Cshift_comms(Lattice &ret,const Lattice &r tscatter+=usecond(); } } - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -301,12 +303,243 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=rhs.Grid(); + Lattice temp(rhs.Grid()); + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + assert(simd_layout==1); + 
assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; + static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); + static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); + vobj *send_buf; + vobj *recv_buf; + { + grid->ShmBufferFreeAll(); + size_t bytes = buffer_size*sizeof(vobj); + send_buf=(vobj *)grid->ShmBufferMalloc(bytes); + recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); + } + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(vobj); + + tgather-=usecond(); + Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + tgather+=usecond(); + + // int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + + tcomms-=usecond(); + // grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + xbytes+=bytes; + acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); + + // grid->Barrier(); + tcomms+=usecond(); + + tscatter-=usecond(); + Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + tscatter+=usecond(); + } + } + if(Cshift_verbose){ + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + GridBase *grid=rhs.Grid(); + const int Nsimd = grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; + typedef typename vobj::scalar_type scalar_type; + + int fd = grid->_fdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; + int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; + int simd_layout = grid->_simd_layout[dimension]; + int comm_dim = grid->_processors[dimension] >1 ; + + //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // int words = sizeof(vobj)/sizeof(vector_type); + + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + { + size_t bytes = sizeof(scalar_object)*buffer_size; + grid->ShmBufferFreeAll(); + send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + } + for(int s=0;s pointers(Nsimd); // + ExtractPointerArray rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + int cb = (cbmask==0x2)? Odd : Even; + int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + + assert (sx == nbr_ox); + + if(nbr_proc){ + grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + tcomms-=usecond(); + // grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); + grid->SendToRecvFrom((void *)send_buf_extract_mpi, + xmit_to_rank, + (void *)recv_buf_extract_mpi, + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); + xbytes+=bytes; + + // grid->Barrier(); + tcomms+=usecond(); + rpointers[i] = &recv_buf_extract[i][0]; + } else { + rpointers[i] = &send_buf_extract[nbr_lane][0]; + } + + } + tscatter-=usecond(); + Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); + tscatter+=usecond(); + + } + if(Cshift_verbose){ + std::cout << GridLogPerformance << " Cshift (s) copy "< Date: Fri, 18 Oct 2024 13:56:53 -0400 Subject: [PATCH 30/50] Clean up --- Grid/threads/Accelerator.h | 59 +++++++------------------------------- 1 file changed, 11 insertions(+), 48 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index e37b5fb7..dc68fd2d 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -132,27 +132,17 @@ inline void cuda_mem(void) #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ { \ - int nt=acceleratorThreads(); \ - typedef uint64_t Iterator; \ - auto lambda = [=] accelerator \ - (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ - __VA_ARGS__; \ - }; \ - dim3 cu_threads(nsimd,acceleratorThreads(),1); \ - dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ - } -#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ - { \ - int nt=acceleratorThreads(); \ - typedef uint64_t Iterator; \ - auto lambda = [=] accelerator \ - (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ - __VA_ARGS__; \ - }; \ - dim3 cu_threads(nsimd,acceleratorThreads(),1); \ - dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - ProfileLambdaApply<<>>(num1,num2,nsimd,lambda); \ + if ( num1*num2 ) { \ + int nt=acceleratorThreads(); \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ + __VA_ARGS__; \ + }; \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ + dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ + LambdaApply<<>>(num1,num2,nsimd,lambda); \ + } \ } #define accelerator_for6dNB(iter1, num1, \ @@ -175,19 +165,6 @@ inline void cuda_mem(void) } -#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ - { \ - int nt=acceleratorThreads(); \ - typedef uint64_t Iterator; \ - auto lambda = [=] accelerator \ - (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ - __VA_ARGS__; \ - }; \ - dim3 cu_threads(nsimd,acceleratorThreads(),1); \ - dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ - LambdaApply<<>>(num1,num2,nsimd,lambda); \ - } - template __global__ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) { @@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) Lambda(x,y,z); } } -template __global__ -void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) -{ - // Weird permute is to make lane coalesce for large blocks - uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; - uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; - uint64_t z = threadIdx.x; - if ( (x < num1) && (y __global__ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, @@ -523,9 +489,6 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) // FIXME -- the non-blocking nature got broken March 30 2023 by PAB #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} ); -#define prof_accelerator_for( iter1, num1, nsimd, ... ) \ - prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\ - accelerator_barrier(dummy); #define accelerator_for( iter, num, nsimd, ... ) \ accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \ From 11b07b950d7f87b872ccc74d6808f3a526a4dca4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 18 Oct 2024 13:57:40 -0400 Subject: [PATCH 31/50] Vanilla linux compile, assuming spack prerequisites --- systems/Linux-cuda/config-command | 16 ++++++++++++++++ systems/Linux-cuda/sourceme.sh | 12 ++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 systems/Linux-cuda/config-command create mode 100644 systems/Linux-cuda/sourceme.sh diff --git a/systems/Linux-cuda/config-command b/systems/Linux-cuda/config-command new file mode 100644 index 00000000..ebdb5356 --- /dev/null +++ b/systems/Linux-cuda/config-command @@ -0,0 +1,16 @@ +../../configure \ + --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-shm=nvlink \ + --with-lime=$CLIME \ + --with-gmp=$GMP \ + --with-mpfr=$MPFR \ + --enable-accelerator=cuda \ + --disable-gparity \ + --disable-fermion-reps \ + --disable-unified \ + CXX=nvcc \ + LDFLAGS="-cudart shared -L$NVIDIALIB -lcublas" \ + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared" + diff --git a/systems/Linux-cuda/sourceme.sh b/systems/Linux-cuda/sourceme.sh new file mode 100644 index 00000000..fbec47a4 --- /dev/null +++ b/systems/Linux-cuda/sourceme.sh @@ -0,0 +1,12 @@ +. 
/home/paboyle/spack/share/spack/setup-env.sh +spack load cuda@12.0.0 +spack load c-lime +spack load gmp +spack load mpfr +spack load openmpi +export CUDA=`spack find --paths cuda@11.8.0 | grep cuda | cut -c 14-` +export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-` +export GMP=`spack find --paths gmp | grep gmp | cut -c 12-` +export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-` +export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/ +export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH From 955da582b6ed79a2012f90ff2bef2f1d5570bf97 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 18 Oct 2024 13:58:03 -0400 Subject: [PATCH 32/50] Working on NVCC --- Grid/qcd/utils/A2Autils.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index 70eaf0ab..1aeacbf2 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -354,7 +354,7 @@ void A2Autils::MesonFieldGPU(TensorType &mat, int Lblock = mat.dimension(3); int Rblock = mat.dimension(4); - assert(Lblock % block==0); + // assert(Lblock % block==0); // assert(Rblock % block==0); GridBase *grid = lhs_wi[0].Grid(); @@ -387,7 +387,6 @@ void A2Autils::MesonFieldGPU(TensorType &mat, ////////////////////////////////////////// // Should write a SpinOuterColorTrace ////////////////////////////////////////// - std::cout < Date: Fri, 18 Oct 2024 13:58:33 -0400 Subject: [PATCH 33/50] Config command --- systems/Lumi/config-command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/Lumi/config-command b/systems/Lumi/config-command index 5e596285..76854edc 100644 --- a/systems/Lumi/config-command +++ b/systems/Lumi/config-command @@ -1,7 +1,7 @@ spack load c-lime spack load gmp spack load mpfr -CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-` +CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-` GMP=`spack find --paths gmp | grep gmp | cut -c 12-` MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-` echo clime X$CLIME From 5ae77876a81d4164f93565a4f33d4c7ebb84c0b1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 18 Oct 2024 19:08:06 -0400 Subject: [PATCH 34/50] Meson field and Aslash field on GPU; some compiler warning removed --- Grid/allocator/MemoryManagerCache.cc | 1 - Grid/lattice/Lattice_reduction.h | 8 +- Grid/qcd/action/ActionBase.h | 6 +- Grid/qcd/action/gauge/WilsonGaugeAction.h | 5 + Grid/qcd/utils/A2Autils.h | 1263 +++++++++++++-------- Grid/util/FlightRecorder.cc | 7 +- benchmarks/Benchmark_usqcd.cc | 4 +- examples/Example_taku.cc | 383 ------- examples/Example_taku1.cc | 479 -------- examples/Example_taku2.cc | 433 ------- systems/Linux-cuda/config-command | 2 + systems/Linux-cuda/sourceme.sh | 6 +- tests/Test_meson_field.cc | 59 +- 13 files changed, 812 insertions(+), 1844 deletions(-) delete mode 100644 examples/Example_taku.cc delete mode 100644 examples/Example_taku1.cc delete mode 100644 examples/Example_taku2.cc diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index c610fb9c..e5cc0c42 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,7 +1,6 @@ #include #ifndef GRID_UVM -#warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); #define MAXLINE 512 diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 837e3bea..53a592d1 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -376,9 +376,9 @@ axpby_norm_fast(Lattice &z,sobj 
a,sobj b,const Lattice &x,const Latt coalescedWrite(z_v[ss],tmp); }); bool ok; +#ifdef GRID_SYCL uint64_t csum=0; uint64_t csum2=0; -#ifdef GRID_SYCL if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) { // z_v @@ -522,14 +522,11 @@ template inline void sliceSum(const Lattice &Data, int ostride=grid->_ostride[orthogdim]; //Reduce Data down to lvSum - RealD t_sum =-usecond(); sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); - t_sum +=usecond(); // Sum across simd lanes in the plane, breaking out orthog dir. Coordinate icoor(Nd); - RealD t_rest =-usecond(); for(int rt=0;rt inline void sliceSum(const Lattice &Data, scalar_type * ptr = (scalar_type *) &result[0]; int words = fd*sizeof(sobj)/sizeof(scalar_type); grid->GlobalSumVector(ptr, words); - t_rest +=usecond(); - std::cout << GridLogMessage << " sliceSum local"< inline diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 8acae81b..c3a46729 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -98,7 +98,7 @@ public: virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative - + ///////////////////////////////////////////////////////////// // virtual smeared interface through configuration container ///////////////////////////////////////////////////////////// @@ -132,6 +132,10 @@ public: template class EmptyAction : public Action { + using Action::refresh; + using Action::Sinitial; + using Action::deriv; + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative diff --git a/Grid/qcd/action/gauge/WilsonGaugeAction.h b/Grid/qcd/action/gauge/WilsonGaugeAction.h index f535b54f..22c792cc 100644 --- a/Grid/qcd/action/gauge/WilsonGaugeAction.h +++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h @@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action { public: INHERIT_GIMPL_TYPES(Gimpl); + using Action::S; + using Action::Sinitial; + using Action::deriv; + using Action::refresh; + /////////////////////////// constructors explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index 1aeacbf2..7089fd1b 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -64,40 +64,6 @@ public: const std::vector &mom, int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); - template - static void MesonFieldGPU(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - std::vector gammas, - const std::vector &mom, - int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); - /* - static void PionFieldWVmom(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - const std::vector &mom, - int orthogdim); - - static void PionFieldXX(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim, - int g5); - - static void PionFieldWV(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim); - static void PionFieldWW(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField 
*wj, - int orthogdim); - static void PionFieldVV(Eigen::Tensor &mat, - const FermionField *vi, - const FermionField *vj, - int orthogdim); - */ - template // output: rank 5 tensor, e.g. Eigen::Tensor static void AslashField(TensorType &mat, const FermionField *lhs_wi, @@ -157,6 +123,211 @@ private: const int Ns, const int ss); }; +const int A2Ablocking=8; + +template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; +typedef iVecSpinMatrix VecSpinMatrix; +typedef iVecSpinMatrix vVecSpinMatrix; +typedef Lattice LatticeVecSpinMatrix; + +template using iVecComplex = iVector >, A2Ablocking>; +typedef iVecComplex VecComplex; +typedef iVecComplex vVecComplex; +typedef Lattice LatticeVecComplex; + +#define A2A_GPU_KERNELS +#ifdef A2A_GPU_KERNELS +template +template +void A2Autils::MesonField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + // assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + // const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecSpinMatrix MomSpinMat(grid); + + std::vector sliced; + for(int i=0;ioSites(),(size_t)Nsimd,{ + auto left = conjugate(lhs_v(ss)); + auto right = rhs_v(ss); + auto vv = SpinMat_v(ss); + for(int s1=0;s1(sliced[t],jj); + auto trSG = trace(tmp*Gamma(gammas[mu])); + mat(m,mu,t,i,j) = trSG()(); + } + } + } + } + }//jo + } +} + +// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) +// +// With: +// +// B_0 = A_0 + i A_1 +// B_1 = A_2 + i A_3 +// +// then in spin space +// +// ( 0 0 -conj(B_1) -B_0 ) +// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 ) +// ( B_1 B_0 0 0 ) +// ( conj(B_0) -conj(B_1) 0 0 ) + +template +template +void A2Autils::AslashField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + int Nem = emB0.size(); + assert(emB1.size() == Nem); + + // assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecComplex Aslash(grid); + std::vector sliced; + + for(int i=0;ioSites(),(size_t)Nsimd,{ + auto left = conjugate(lhs_v(ss)); + auto right = rhs_v(ss); + auto vv = SpinMat_v(ss); + for(int s1=0;s1oSites(),(size_t)Nsimd,{ + auto vv = SpinMat_v(ss); + auto b0 = emB0_v(ss); + auto b1 = emB1_v(ss); + auto cb0 = conjugate(b0); + auto cb1 = conjugate(b1); + auto asl = Aslash_v(ss); + for(int j=jo;j template void A2Autils::MesonField(TensorType &mat, @@ -329,488 +500,41 @@ void 
A2Autils::MesonField(TensorType &mat, if (t_gsum) *t_gsum += usecond(); } -const int A2Ablocking=8; -template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; -typedef iVecSpinMatrix VecSpinMatrix; -typedef iVecSpinMatrix vVecSpinMatrix; -typedef Lattice LatticeVecSpinMatrix; - template template -void A2Autils::MesonFieldGPU(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - std::vector gammas, - const std::vector &mom, - int orthogdim, double *t_kernel, double *t_gsum) +void A2Autils::AslashField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, double *t_kernel, double *t_gsum) { - const int block=A2Ablocking; - typedef typename FImpl::SiteSpinor vobj; - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; + typedef typename FermionField::vector_object vobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + typedef iSinglet Singlet_v; + typedef iSinglet Singlet_s; + int Lblock = mat.dimension(3); int Rblock = mat.dimension(4); - - // assert(Lblock % block==0); - // assert(Rblock % block==0); GridBase *grid = lhs_wi[0].Grid(); const int Nd = grid->_ndimension; const int Nsimd = grid->Nsimd(); - int Nt = grid->GlobalDimensions()[orthogdim]; - int Ngamma = gammas.size(); - int Nmom = mom.size(); - - - LatticeVecSpinMatrix SpinMat(grid); - LatticeVecSpinMatrix MomSpinMat(grid); - - RealD t_afor = 0.0; - RealD t_sum = 0.0; - RealD t_pha = 0.0; - RealD t_trace= 0.0; - uint64_t ncall=0; - - std::vector sliced; - for(int i=0;ioSites(),(size_t)Nsimd,{ - auto left = conjugate(lhs_v(ss)); - auto right = rhs_v(ss); - auto vv = SpinMat_v(ss); - for(int s1=0;s1 -void A2Autils::PionFieldXX(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim, - int g5) -{ - int Lblock = mat.dimension(1); - int Rblock = mat.dimension(2); - - GridBase *grid = wi[0].Grid(); - - const int nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nem = emB0.size(); + assert(emB1.size() == Nem); int fd=grid->_fdimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - - // will locally sum vectors first - // sum across these down to scalars - // splitting the SIMD - int MFrvol = rd*Lblock*Rblock; - int MFlvol = ld*Lblock*Rblock; - - std::vector lvSum(MFrvol); - thread_for(r,MFrvol,{ - lvSum[r] = Zero(); - }); - - std::vector lsSum(MFlvol); - thread_for(r,MFlvol,{ - lsSum[r]=scalar_type(0.0); - }); - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - - thread_for(r,rd,{ - - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - - for(int n=0;n temp; - ExtractBuffer > extracted(Nsimd); - - for(int i=0;iiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; - - } - }} - }); - - assert(mat.dimension(0) == Nt); - // ld loop and local only?? 
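
The retired MesonFieldGPU path being deleted here and its replacement earlier in this patch share one idea: right-hand vectors are handled in blocks of A2Ablocking=8, each site accumulates a small fixed-size vector of contractions, and one sliceSum per block replaces the per-site SIMD-lane extraction of the old kernel. A minimal stand-alone sketch of that blocking pattern, with plain std:: scalars standing in for Grid's spin-colour fields and made-up lattice extents, so it compiles on its own:

  #include <array>
  #include <complex>
  #include <iostream>
  #include <vector>

  // Sketch of the blocking used in the GPU MesonField: inner products for a
  // block of right-hand vectors are accumulated site by site, then reduced
  // per time slice once per block (one "sliceSum" per block).
  int main()
  {
    using Cplx = std::complex<double>;
    const int Lt = 4, Vslice = 8;          // placeholder lattice extents
    const int Nj = 16, block = 8;          // analogue of A2Ablocking
    const int V  = Lt * Vslice;

    std::vector<Cplx> w(V), v(Nj * V);     // stand-ins for the w_i, v_j fields
    for (int s = 0; s < V; s++) w[s] = Cplx(1.0, 0.0);
    for (int j = 0; j < Nj; j++)
      for (int s = 0; s < V; s++) v[j * V + s] = Cplx(0.0, j + 1.0);

    std::vector<Cplx> mat(Nj * Lt, Cplx(0, 0));

    for (int jo = 0; jo < Nj; jo += block) {        // loop over j blocks
      // site-local accumulation for the whole block (the accelerator_for)
      std::vector<std::array<Cplx, 8>> SpinMat(V);
      for (int s = 0; s < V; s++)
        for (int jj = 0; jj < block; jj++)
          SpinMat[s][jj] = std::conj(w[s]) * v[(jo + jj) * V + s];

      // one slice reduction per block (the sliceSum)
      for (int t = 0; t < Lt; t++)
        for (int jj = 0; jj < block; jj++)
          for (int s = 0; s < Vslice; s++)
            mat[(jo + jj) * Lt + t] += SpinMat[t * Vslice + s][jj];
    }

    std::cout << "mat(j=3,t=0) = " << mat[3 * Lt + 0] << "\n"; // (0,32): 8 sites * i*4
    return 0;
  }

One reduction per block of eight amortises the slice sum, the communication-heavy step, over eight inner products; as far as the patch indicates, that is the motivation for the iVecSpinMatrix payload.
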
- int pd = grid->_processors[orthogdim]; - int pc = grid->_processor_coor[orthogdim]; - thread_for_collapse(2,lt,ld,{ - for(int pt=0;ptGlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); -} - -template -void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - const std::vector &mom, - int orthogdim) -{ - int Lblock = mat.dimension(2); - int Rblock = mat.dimension(3); - - GridBase *grid = wi[0].Grid(); - const int nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - int Nmom = mom.size(); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - // will locally sum vectors first - // sum across these down to scalars - // splitting the SIMD - int MFrvol = rd*Lblock*Rblock*Nmom; - int MFlvol = ld*Lblock*Rblock*Nmom; - - std::vector lvSum(MFrvol); - thread_for(r,MFrvol,{ - lvSum[r] = Zero(); - }); - - std::vector lsSum(MFlvol); - thread_for(r,MFlvol,{ - lsSum[r]=scalar_type(0.0); - }); - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - - thread_for(r,rd,{ - - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - - for(int n=0;n temp; - ExtractBuffer > extracted(Nsimd); - - for(int i=0;iiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; - - } - }}} - }); - - assert(mat.dimension(0) == Nmom); - assert(mat.dimension(1) == Nt); - - int pd = grid->_processors[orthogdim]; - int pc = grid->_processor_coor[orthogdim]; - thread_for_collapse(2,lt,ld,{ - for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); -} - -template -void A2Autils::PionFieldWV(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim) -{ - const int g5=1; - PionFieldXX(mat,wi,vj,orthogdim,g5); -} -template -void A2Autils::PionFieldWW(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *wj, - int orthogdim) -{ - const int nog5=0; - PionFieldXX(mat,wi,wj,orthogdim,nog5); -} -template -void A2Autils::PionFieldVV(Eigen::Tensor &mat, - const FermionField *vi, - const FermionField *vj, - int orthogdim) -{ - const int nog5=0; - PionFieldXX(mat,vi,vj,orthogdim,nog5); -} -*/ - -// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) -// -// With: -// -// B_0 = A_0 + i A_1 -// B_1 = A_2 + i A_3 -// -// then in spin space -// -// ( 0 0 -conj(B_1) -B_0 ) -// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 ) -// ( B_1 B_0 0 0 ) -// ( conj(B_0) -conj(B_1) 0 0 ) -template -template -void A2Autils::AslashField(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - const std::vector &emB0, - const std::vector &emB1, - int orthogdim, double *t_kernel, double *t_gsum) -{ - typedef typename FermionField::vector_object vobj; - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - typedef iSpinMatrix SpinMatrix_v; - typedef iSpinMatrix SpinMatrix_s; - typedef iSinglet Singlet_v; - typedef iSinglet Singlet_s; - - int Lblock = mat.dimension(3); - int Rblock = mat.dimension(4); - - GridBase *grid = lhs_wi[0].Grid(); - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - int Nem = emB0.size(); - assert(emB1.size() == Nem); 
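
Before the volume loops, the spin block quoted in the comment above can be written out explicitly. The following stand-alone check applies that 4x4 matrix for i * A_mu gamma_mu to one colour component of a spinor; B0, B1 and psi are arbitrary test values, and only the block structure itself is taken from the comment (the underlying gamma basis is not re-derived here):

  #include <array>
  #include <complex>
  #include <iostream>

  int main()
  {
    using Cplx = std::complex<double>;
    // B_0 = A_0 + i A_1, B_1 = A_2 + i A_3, for some real test field values
    const double A[4] = {0.3, -0.1, 0.7, 0.2};
    const Cplx B0(A[0], A[1]);
    const Cplx B1(A[2], A[3]);

    std::array<Cplx, 4> psi = {Cplx(1, 0), Cplx(0, 1), Cplx(2, 0), Cplx(0, -1)};
    std::array<Cplx, 4> chi;

    // Rows of the matrix displayed in the comment:
    //   ( 0          0         -conj(B1)  -B0 )
    //   ( 0          0         -conj(B0)   B1 )
    //   ( B1         B0         0          0  )
    //   ( conj(B0)  -conj(B1)   0          0  )
    chi[0] = -std::conj(B1) * psi[2] - B0 * psi[3];
    chi[1] = -std::conj(B0) * psi[2] + B1 * psi[3];
    chi[2] =  B1 * psi[0] + B0 * psi[1];
    chi[3] =  std::conj(B0) * psi[0] - std::conj(B1) * psi[1];

    for (int s = 0; s < 4; s++)
      std::cout << "chi[" << s << "] = " << chi[s] << "\n";
    return 0;
  }
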
- - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - // will locally sum vectors first // sum across these down to scalars // splitting the SIMD @@ -836,7 +560,7 @@ void A2Autils::AslashField(TensorType &mat, // Nested parallelism would be ok // Wasting cores here. Test case r if (t_kernel) *t_kernel = -usecond(); - thread_for(r,rd, + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -863,8 +587,8 @@ void A2Autils::AslashField(TensorType &mat, + left()(s2)(1) * right()(s1)(1) + left()(s2)(2) * right()(s1)(2); } - - // After getting the sitewise product do the mom phase loop + + // After getting the sitewise product do the mom phase loop int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, } } } - }); + } // Sum across simd lanes in the plane, breaking out orthog dir. thread_for(rt,rd, @@ -950,7 +674,7 @@ void A2Autils::AslashField(TensorType &mat, grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock); if (t_gsum) *t_gsum += usecond(); } - +#endif //////////////////////////////////////////// // Schematic thoughts about more generalised four quark insertion // @@ -1361,6 +1085,8 @@ Bag [8,4] fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17) // - 1602 }); } + + #ifdef DELTA_F_EQ_2 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Perhaps this should move out of the utils and into Hadrons module @@ -1592,5 +1318,534 @@ void A2Autils::DeltaFeq2(int dt_min,int dt_max, } #endif + /* + static void PionFieldWVmom(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + const std::vector &mom, + int orthogdim); + + static void PionFieldXX(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim, + int g5); + + static void PionFieldWV(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim); + static void PionFieldWW(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *wj, + int orthogdim); + static void PionFieldVV(Eigen::Tensor &mat, + const FermionField *vi, + const FermionField *vj, + int orthogdim); + */ + +/* + +template +template +void A2Autils::MesonField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nmom; + int MFlvol = ld*Lblock*Rblock*Nmom; + + std::vector lvSum(MFrvol); + for(int r=0;r lsSum(MFlvol); + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + // potentially wasting cores here 
if local time extent too small + if (t_kernel) *t_kernel = -usecond(); + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; + + } + }}} + } + if (t_kernel) *t_kernel += usecond(); + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Ngamma); + assert(mat.dimension(2) == Nt); + + // ld loop and local only?? + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock); + if (t_gsum) *t_gsum += usecond(); +} + +template +void A2Autils::PionFieldXX(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim, + int g5) +{ + int Lblock = mat.dimension(1); + int Rblock = mat.dimension(2); + + GridBase *grid = wi[0].Grid(); + + const int nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock; + int MFlvol = ld*Lblock*Rblock; + + std::vector lvSum(MFrvol); + thread_for(r,MFrvol,{ + lvSum[r] = Zero(); + }); + + std::vector lsSum(MFlvol); + thread_for(r,MFlvol,{ + lsSum[r]=scalar_type(0.0); + }); + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + thread_for(r,rd,{ + + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n temp; + ExtractBuffer > extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; + + } + }} + }); + + assert(mat.dimension(0) == Nt); + // ld loop and local only?? 
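
The loop immediately below is the node assembly idiom used throughout this file: each rank writes its own local time slices into the global-time positions it owns (pt == pc), zeroes everything else, and a single GlobalSumVector then reconstructs the full correlator. A serial sketch with the ranks simulated in a loop, assuming four ranks of two local slices each:

  #include <iostream>
  #include <vector>

  int main()
  {
    const int pd = 4, ld = 2, Nt = pd * ld;   // 4 ranks, 2 local slices each
    std::vector<double> global(Nt, 0.0);

    for (int pc = 0; pc < pd; pc++) {         // stand-in for the MPI ranks
      std::vector<double> mat(Nt);
      std::vector<double> lsSum(ld);
      for (int lt = 0; lt < ld; lt++) lsSum[lt] = 100.0 * pc + lt; // local data

      for (int lt = 0; lt < ld; lt++)
        for (int pt = 0; pt < pd; pt++) {
          int t = lt + pt * ld;
          mat[t] = (pt == pc) ? lsSum[lt] : 0.0; // zero-fill remote slices
        }

      for (int t = 0; t < Nt; t++) global[t] += mat[t]; // the GlobalSumVector
    }

    for (int t = 0; t < Nt; t++)
      std::cout << "t=" << t << " -> " << global[t] << "\n";
    return 0;
  }

Zero-filling the slices a rank does not own is what lets a plain all-reduce sum double as a gather: every global entry receives exactly one non-zero contribution.
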
+ int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); +} + +template +void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + const std::vector &mom, + int orthogdim) +{ + int Lblock = mat.dimension(2); + int Rblock = mat.dimension(3); + + GridBase *grid = wi[0].Grid(); + + const int nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nmom; + int MFlvol = ld*Lblock*Rblock*Nmom; + + std::vector lvSum(MFrvol); + thread_for(r,MFrvol,{ + lvSum[r] = Zero(); + }); + + std::vector lsSum(MFlvol); + thread_for(r,MFlvol,{ + lsSum[r]=scalar_type(0.0); + }); + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + thread_for(r,rd,{ + + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n temp; + ExtractBuffer > extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; + + } + }}} + }); + + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Nt); + + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); +} + +template +void A2Autils::PionFieldWV(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim) +{ + const int g5=1; + PionFieldXX(mat,wi,vj,orthogdim,g5); +} +template +void A2Autils::PionFieldWW(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *wj, + int orthogdim) +{ + const int nog5=0; + PionFieldXX(mat,wi,wj,orthogdim,nog5); +} +template +void A2Autils::PionFieldVV(Eigen::Tensor &mat, + const FermionField *vi, + const FermionField *vj, + int orthogdim) +{ + const int nog5=0; + PionFieldXX(mat,vi,vj,orthogdim,nog5); +} +*/ + NAMESPACE_END(Grid); diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index c19d3dbb..139e7957 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -280,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) if(LoggingMode == LoggingModeNone) return; if ( ChecksumCommsSend ){ - uint64_t *ubuf = (uint64_t *)buf; - if(LoggingMode == LoggingModeNone) return; + + if(LoggingMode == LoggingModeNone) return; #ifdef GRID_SYCL + uint64_t *ubuf = (uint64_t *)buf; uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); if(LoggingMode == LoggingModePrint) { std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor < L_list({8,12,16,24,32}); + std::vector L_list({8,12,16,24}); int selm1=sel-1; std::vector clover; diff --git a/examples/Example_taku.cc b/examples/Example_taku.cc deleted file mode 100644 index b9ad272e..00000000 --- a/examples/Example_taku.cc +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Warning: This code illustrative only: not well tested, and not meant for production use - * without regression / tests being applied - */ - 
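
The example files removed below carry one reusable recipe worth recording: Gaussian smearing by repeated application of the covariant Laplacian, smeared = (1 - (w^2/4N) Lap)^N source, approximating exp(-(w^2/4) Lap). In the sketch a free 1-d Laplacian stands in for CovariantLaplacianCshift so it runs on its own; the width and iteration count are the values from the deleted GaussianSmear:

  #include <iostream>
  #include <vector>

  // Positive Laplacian, matching the sign convention of the deleted
  // CovariantLaplacianCshift::M: out = 2*in - shift_forward - shift_backward.
  void Laplacian(const std::vector<double> &in, std::vector<double> &out)
  {
    const int L = (int)in.size();
    for (int x = 0; x < L; x++)
      out[x] = 2.0 * in[x] - in[(x + 1) % L] - in[(x + L - 1) % L];
  }

  int main()
  {
    const int L = 32, Iterations = 40;
    const double width = 2.0;
    const double coeff = width * width / (4.0 * Iterations);

    std::vector<double> smeared(L, 0.0), tmp(L);
    smeared[L / 2] = 1.0;                       // point source

    for (int n = 0; n < Iterations; n++) {      // apply (1 - coeff*Lap)^N
      Laplacian(smeared, tmp);
      for (int x = 0; x < L; x++) smeared[x] -= coeff * tmp[x];
    }

    for (int x = L / 2 - 3; x <= L / 2 + 3; x++)
      std::cout << "x=" << x << " " << smeared[x] << "\n";
    return 0;
  }
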
-#include - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-8,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=3; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, - {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, - {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} - }; - - Gamma G5(Gamma::Algebra::Gamma5); - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - 
for(int t=0;t seeds4({1,2,3,4}); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=1.8; - if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - RealD P=0.5871119; // 48I - // RealD P=0.6153342; // 64I - // RealD P=0.6388238 // 32Ifine - RealD u0 = sqrt(sqrt(P)); - RealD M5mf = M5 - 4.0*(1.0-u0); - RealD w0 = 1.0 - M5mf; -#if 0 - // M5=1.8 with U=u0 - Umu = Umu * u0; - LLscale = 1.0; - LCscale = 1.0; - std::cout< PointProps(nmass,UGrid); - // std::vector GaussProps(nmass,UGrid); - // std::vector Z2Props (nmass,UGrid); - - for(int m=0;m - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = source.Grid(); - GridBase *FGrid = D.FermionGrid(); - bool fiveD = true; //calculate 5d free propagator - RealD mass = D.Mass(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - LatticeFermion result5(FGrid); - LatticeFermion src5(FGrid); - LatticePropagator prop5(FGrid); - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - D.FreePropagator(src5,result5,mass,true); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - - LatticePropagator Vector_mu(UGrid); - LatticeComplex VV (UGrid); - std::vector sumVV; - Gamma::Algebra GammaV[3] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ - }; - for( int mu=0;mu<3;mu++ ) { - Gamma gV(GammaV[mu]); - D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); - VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current - sliceSum(VV,sumVV,Tdir); - int Nt = sumVV.size(); - for(int t=0;t -void 
MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - bool fiveD = false; //calculate 4d free propagator - RealD mass = D.Mass(); - GridBase *UGrid = source.Grid(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - for(int s=0;s(src4,source,s,c); - D.FreePropagator(src4,result4,mass,false); - FermToProp(propagator,result4,s,c); - } - } -} - -template -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-10,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=4; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5}, - {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5}, - {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5}, - {Gamma::Algebra::Identity,Gamma::Algebra::Identity} - }; - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=atof(getenv("M5")); - RealD mq = atof(getenv("mass")); - int tadpole = atof(getenv("tadpole")); - std::vector masses({ mq} ); // u/d, s, c ?? 
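
For reference, the mean-field normalisation these removed examples compute can be checked by hand. The formulas below are collected from the three Example_taku files (u0 and M5_mf from Example_taku.cc, the LLscale combination from Example_taku2.cc); whether any single file combines them exactly this way should be checked against the deleted sources:

  #include <cmath>
  #include <iostream>

  // Tadpole-improved normalisation used in the deleted examples:
  // u0 = P^{1/4}, M5_mf = M5 - 4(1-u0), w0 = 1 - M5_mf, LLscale = 1/(1-w0^2)^2.
  int main()
  {
    const double P  = 0.5871119;   // 48I plaquette value quoted in the examples
    const double M5 = 1.8;         // domain wall height used there

    const double u0      = std::sqrt(std::sqrt(P));
    const double M5mf    = M5 - 4.0 * (1.0 - u0);
    const double w0      = 1.0 - M5mf;
    const double LLscale = 1.0 / ((1.0 - w0 * w0) * (1.0 - w0 * w0));

    std::cout << "u0      = " << u0 << "\n";      // ~0.875
    std::cout << "M5_mf   = " << M5mf << "\n";    // ~1.301
    std::cout << "w0      = " << w0 << "\n";      // ~-0.301
    std::cout << "LLscale = " << LLscale << "\n"; // ~1.21
    return 0;
  }
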
- if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - // RealD P=0.6388238 // 32Ifine - // RealD P=0.6153342; // 64I - RealD P=0.5871119; // 48I - RealD u0 = sqrt(sqrt(P)); - RealD w0 = 1 - M5; - std::cout< boundary = {1,1,1,-1}; - FermionActionD::ImplParams Params(boundary); - RealD b=1.5; - RealD c=0.5; - std::cout< PointProps(nmass,UGrid); - // std::vector FreeProps(nmass,UGrid); - // LatticePropagator delta(UGrid); - - for(int m=0;m - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = source.Grid(); - GridBase *FGrid = D.FermionGrid(); - bool fiveD = true; //calculate 4d free propagator - RealD mass = D.Mass(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - LatticeFermion result5(FGrid); - LatticeFermion src5(FGrid); - LatticePropagator prop5(FGrid); - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - D.FreePropagator(src5,result5,mass,true); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - - LatticePropagator Vector_mu(UGrid); - LatticeComplex VV (UGrid); - std::vector sumVV; - Gamma::Algebra GammaV[3] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ - }; - for( int mu=0;mu<3;mu++ ) { - Gamma gV(GammaV[mu]); - D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); - VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current - sliceSum(VV,sumVV,Tdir); - int Nt = sumVV.size(); - for(int t=0;t -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 
(UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-6,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=3; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, - {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, - // {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} - {Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5} - }; - - Gamma G5(Gamma::Algebra::Gamma5); - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=atof(getenv("M5")); - RealD mq = atof(getenv("mass")); - std::vector masses({ mq} ); // u/d, s, c ?? - if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - // RealD P=0.6153342; // 64I - // RealD P=0.6388238 // 32Ifine - // RealD P=0.5871119; // 48I - // RealD u0 = sqrt(sqrt(P)); - // Umu = Umu * u0; - RealD w0 = 1 - M5; - LLscale = 1.0/(1-w0*w0)/(1-w0*w0); - LCscale = 1.0/(1-w0*w0)/(1-w0*w0); - std::cout< PointProps(nmass,UGrid); - std::vector FreeProps(nmass,UGrid); - LatticePropagator delta(UGrid); - - for(int m=0;m phi(VDIM,&grid); + std::vector B0(Nem,&grid); + std::vector B1(Nem,&grid); std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; for (unsigned int i = 0; i < VDIM; ++i){ random(pRNG,phi[i]); } + for (unsigned int i = 0; i < Nem; ++i){ + random(pRNG,B0[i]); + random(pRNG,B1[i]); + } std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; // Gamma matrices used in the contraction std::vector Gmu = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, @@ -74,11 +85,15 @@ int main(int argc, char *argv[]) std::vector> momenta = { {0.,0.,0.}, {1.,0.,0.}, + {-1.,0.,0.}, + {0,1.,0.}, + {0,-1.,0.}, + {0,0,1.}, + {0,0,-1.}, {1.,1.,0.}, {1.,1.,1.}, {2.,0.,0.} }; - // 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." 
<< std::endl; std::cout << GridLogMessage << "Computing complex phases" << std::endl; @@ -98,46 +113,28 @@ int main(int argc, char *argv[]) std::cout << GridLogMessage << "Computing complex phases done." << std::endl; Eigen::Tensor Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); - Eigen::Tensor Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + Eigen::Tensor App(B0.size(),1,Nt,VDIM,VDIM); // timer double start,stop; + ///////////////////////////////////////////////////////////////////////// //execute meson field routine - std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl; + ///////////////////////////////////////////////////////////////////////// A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); - std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl; start = usecond(); A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); stop = usecond(); std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl; - std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl; - A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp); - std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl; + ///////////////////////////////////////////////////////////////////////// + //execute aslash field routine + ///////////////////////////////////////////////////////////////////////// + A2Autils::AslashField(App,&phi[0],&phi[0],B0,B1,Tp); start = usecond(); - A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp); + A2Autils::AslashField(App,&phi[0],&phi[0],B0,B1,Tp); stop = usecond(); - std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl; - - for(int mom=0;mom Date: Wed, 23 Oct 2024 14:44:04 -0400 Subject: [PATCH 35/50] NVCC happy --- Grid/qcd/utils/SUn.impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h index 02fa161b..d049fcd0 100644 --- a/Grid/qcd/utils/SUn.impl.h +++ b/Grid/qcd/utils/SUn.impl.h @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix &ta) { //////////////////////////////////////////////////////////////////////// // Map a su2 subgroup number to the pair of rows that are non zero //////////////////////////////////////////////////////////////////////// -static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { +static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); int spare = su2_index; From 565b231c0395703210a2472ff5e279eac9067347 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Oct 2024 14:44:17 -0400 Subject: [PATCH 36/50] Nvcc happy --- Grid/qcd/utils/Sp2n.impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/utils/Sp2n.impl.h b/Grid/qcd/utils/Sp2n.impl.h index 4c660d3a..196aba7e 100644 --- a/Grid/qcd/utils/Sp2n.impl.h +++ b/Grid/qcd/utils/Sp2n.impl.h @@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix &ta) { // Map a su2 subgroup number to the pair of rows that are non zero //////////////////////////////////////////////////////////////////////// template -static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { +static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { const int nsp=ncolour/2; assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 
2)); From 655c79f39e0b5b72339676fc45d1ea4f7726548f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Oct 2024 14:44:41 -0400 Subject: [PATCH 37/50] Suppress warning on partial override --- Grid/qcd/action/gauge/PlaqPlusRectangleAction.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index b9d6ac16..b7f31d0e 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -40,6 +40,11 @@ public: INHERIT_GIMPL_TYPES(Gimpl); + using Action::S; + using Action::Sinitial; + using Action::deriv; + using Action::refresh; + private: RealD c_plaq; RealD c_rect; From 5603464f39f50ea8f0f620600189031c7ca99cc7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Oct 2024 14:45:58 -0400 Subject: [PATCH 38/50] Fix in partial fraction import/export physical and make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class --- Grid/qcd/action/fermion/AbstractEOFAFermion.h | 10 ++ Grid/qcd/action/fermion/CayleyFermion5D.h | 11 ++ .../fermion/OverlapWilsonCayleyTanhFermion.h | 2 +- .../OverlapWilsonCayleyZolotarevFermion.h | 4 + .../OverlapWilsonContfracTanhFermion.h | 3 + .../OverlapWilsonContfracZolotarevFermion.h | 3 + .../OverlapWilsonPartialFractionTanhFermion.h | 3 + ...lapWilsonPartialFractionZolotarevFermion.h | 5 + .../action/fermion/PartialFractionFermion5D.h | 2 +- Grid/qcd/action/fermion/WilsonCompressor.h | 23 --- .../CayleyFermion5DImplementation.h | 14 +- .../implementation/CayleyFermion5Dcache.h | 32 ++-- .../DomainWallEOFAFermionCache.h | 50 +++--- .../implementation/MobiusEOFAFermionCache.h | 165 +++++++++--------- .../PartialFractionFermion5DImplementation.h | 5 +- 15 files changed, 179 insertions(+), 153 deletions(-) diff --git a/Grid/qcd/action/fermion/AbstractEOFAFermion.h b/Grid/qcd/action/fermion/AbstractEOFAFermion.h index 18bcb394..3c203d17 100644 --- a/Grid/qcd/action/fermion/AbstractEOFAFermion.h +++ b/Grid/qcd/action/fermion/AbstractEOFAFermion.h @@ -55,6 +55,11 @@ public: RealD alpha; // Mobius scale RealD k; // EOFA normalization constant + // Device resident + deviceVector d_shift_coefficients; + deviceVector d_MooeeInv_shift_lc; + deviceVector d_MooeeInv_shift_norm; + virtual void Instantiatable(void) = 0; // EOFA-specific operations @@ -92,6 +97,11 @@ public: this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); + + d_shift_coefficients.resize(Ls); + d_MooeeInv_shift_lc.resize(Ls); + d_MooeeInv_shift_norm.resize(Ls); + }; }; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index 2c56c7ed..c8fbe5a8 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -143,6 +143,17 @@ public: std::vector ueem; std::vector dee; + // Device memory + deviceVector d_diag; + deviceVector d_upper; + deviceVector d_lower; + + deviceVector d_lee; + deviceVector d_dee; + deviceVector d_uee; + deviceVector d_leem; + deviceVector d_ueem; + // Matrices of 5d ee inverse params // std::vector > MatpInv; // std::vector > MatmInv; diff --git a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h index 350e89e2..8f0c91eb 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h +++ 
b/Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h @@ -42,7 +42,7 @@ public: void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { this->MomentumSpacePropagatorHw(out,in,_m,twist); - }; + }; // Constructors OverlapWilsonCayleyTanhFermion(GaugeField &_Umu, diff --git a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h index d15690fa..33e59b88 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h @@ -41,6 +41,10 @@ public: public: // Constructors + virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h index 9d1a9a86..5b603017 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h @@ -41,6 +41,9 @@ public: public: virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; // Constructors OverlapWilsonContFracTanhFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h index ce796d4a..747cb508 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h @@ -40,6 +40,9 @@ public: INHERIT_IMPL_TYPES(Impl); virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; // Constructors OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h index f2fb46cd..7210d6af 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h @@ -41,6 +41,9 @@ public: public: virtual void Instantiatable(void){}; + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; // Constructors OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h index f98b64a9..f0be4388 100644 --- a/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h +++ b/Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h @@ -40,6 +40,11 @@ public: INHERIT_IMPL_TYPES(Impl); virtual void Instantiatable(void){}; + + void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { + this->MomentumSpacePropagatorHw(out,in,_m,twist); + }; + // Constructors OverlapWilsonPartialFractionZolotarevFermion(GaugeField 
&_Umu, GridCartesian &FiveDimGrid, diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index e50a9922..47406730 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D public: INHERIT_IMPL_TYPES(Impl); - const int part_frac_chroma_convention=1; + const int part_frac_chroma_convention=0; void Meooe_internal(const FermionField &in, FermionField &out,int dag); void Mooee_internal(const FermionField &in, FermionField &out,int dag); diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index baa1f684..605bdcec 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -414,29 +414,6 @@ public: // surface_list.resize(0); this->same_node.resize(npoints); }; - - /* - void BuildSurfaceList(int Ls,int vol4){ - - // find same node for SHM - // Here we know the distance is 1 for WilsonStencil - for(int point=0;point_npoints;point++){ - this->same_node[point] = this->SameNode(point); - } - - for(int site = 0 ;site< vol4;site++){ - int local = 1; - for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ - local = 0; - } - } - if(local == 0) { - surface_list.push_back(site); - } - } - } - */ template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index 8dc4fbc8..69b5b02c 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -488,7 +488,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector::SetCoefficientsInternal(RealD zolo_hi,std::vectorMooeeInternalCompute(0,inv,MatpInv,MatmInv); // this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index d3d88cbf..5fbc7612 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -57,9 +57,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, int Ls =this->Ls; - static deviceVector d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t)); auto pdiag = &d_diag[0]; auto pupper = &d_upper[0]; @@ -99,9 +99,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, int Ls=this->Ls; - static deviceVector d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&diag[0] 
,&this->d_diag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t)); auto pdiag = &d_diag[0]; auto pupper = &d_upper[0]; @@ -134,11 +134,11 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto plee = & d_lee [0]; auto pdee = & d_dee [0]; @@ -196,11 +196,11 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi autoView(psi , psi_i,AcceleratorRead); autoView(chi , chi_i,AcceleratorWrite); - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto plee = & d_lee [0]; auto pdee = & d_dee [0]; diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 8a9a0ffa..ae126bb5 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -51,13 +51,13 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); - - auto pdiag = &d_diag[0]; - auto pupper = &d_upper[0]; - auto plower = &d_lower[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); // Flops = 6.0*(Nc*Ns) *Ls*vol @@ -89,14 +89,14 @@ void 
DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio autoView( phi , phi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - - static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); - auto pdiag = &d_diag[0]; - auto pupper = &d_upper[0]; - auto plower = &d_lower[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + + acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); // Flops = 6.0*(Nc*Ns) *Ls*vol @@ -125,18 +125,18 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); - - auto plee = & d_lee [0]; - auto pdee = & d_dee [0]; - auto puee = & d_uee [0]; - auto pleem = & d_leem[0]; - auto pueem = & d_ueem[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); + uint64_t nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 4827e516..b9165edb 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -50,14 +50,14 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField assert(phi.Checkerboard() == psi.Checkerboard()); - static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); - - auto pdiag = &d_diag[0]; - auto pupper = &d_upper[0]; - auto plower = &d_lower[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + // Flops = 6.0*(Nc*Ns) *Ls*vol int 
nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -93,15 +93,15 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion assert(phi.Checkerboard() == psi.Checkerboard()); - static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); - static deviceVector d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); - - auto pdiag = &d_diag[0]; - auto pupper = &d_upper[0]; - auto plower = &d_lower[0]; - auto pshift_coeffs = &d_shift_coeffs[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + auto pshift_coeffs = &this->d_shift_coefficients[0]; + + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t)); // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -138,14 +138,14 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - - static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); - auto pdiag = &d_diag[0]; - auto pupper = &d_upper[0]; - auto plower = &d_lower[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -180,16 +180,16 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm assert(phi.Checkerboard() == psi.Checkerboard()); - static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); - static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); - static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); - static deviceVector d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); - - auto pdiag = &d_diag[0]; - auto pupper = &d_upper[0]; - auto plower = &d_lower[0]; - auto pshift_coeffs = &d_shift_coeffs[0]; + auto pdiag = &this->d_diag[0]; + auto pupper = &this->d_upper[0]; + auto plower = &this->d_lower[0]; + auto pshift_coeffs = &this->d_shift_coefficients[0]; + acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t)); + // Flops = 6.0*(Nc*Ns) *Ls*vol auto pm = this->pm; @@ -230,17 +230,17 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & autoView(psi , psi_i, AcceleratorRead); 
autoView(chi , chi_i, AcceleratorWrite); - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; - auto plee = & d_lee [0]; - auto pdee = & d_dee [0]; - auto puee = & d_uee [0]; - auto pleem = & d_leem[0]; - auto pueem = & d_ueem[0]; + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } @@ -293,23 +293,22 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF autoView(chi , chi_i, AcceleratorWrite); // Move into object and constructor - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); - auto pm = this->pm; - auto plee = & d_lee [0]; - auto pdee = & d_dee [0]; - auto puee = & d_uee [0]; - auto pleem = & d_leem[0]; - auto pueem = & d_ueem[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; + auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0]; + auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0]; - static deviceVector d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); - static deviceVector d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); - auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0]; - auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0]; + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -367,17 +366,17 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel autoView(psi , psi_i, 
AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto plee = &this->d_lee [0]; + auto pdee = &this->d_dee [0]; + auto puee = &this->d_uee [0]; + auto pleem = &this->d_leem[0]; + auto pueem = &this->d_ueem[0]; - auto plee = & d_lee [0]; - auto pdee = & d_dee [0]; - auto puee = & d_uee [0]; - auto pleem = & d_leem[0]; - auto pueem = & d_ueem[0]; + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -426,25 +425,23 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi autoView(chi , chi_i, AcceleratorWrite); int Ls = this->Ls; - static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); - static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); - static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); - auto pm = this->pm; - auto plee = & d_lee [0]; - auto pdee = & d_dee [0]; - auto puee = & d_uee [0]; - auto pleem = & d_leem[0]; - auto pueem = & d_ueem[0]; + auto plee = & this->d_lee [0]; + auto pdee = & this->d_dee [0]; + auto puee = & this->d_uee [0]; + auto pleem = & this->d_leem[0]; + auto pueem = & this->d_ueem[0]; - static deviceVector d_MooeeInvDag_shift_lc(Ls); - static deviceVector d_MooeeInvDag_shift_norm(Ls); - acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t)); - acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); - auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0]; - auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0]; + auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0]; + auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0]; + + acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; diff --git 
a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
index 0206828b..93684929 100644
--- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h
@@ -411,17 +411,18 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
   int Ls = this->Ls;
   conformable(solution5d.Grid(),this->FermionGrid());
   conformable(exported4d.Grid(),this->GaugeGrid());
-  ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
+  ExtractSlice(exported4d, solution5d, Ls-1, 0);
 }
 template<class Impl>
 void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 {
+  //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
   int Ls = this->Ls;
   conformable(imported5d.Grid(),this->FermionGrid());
   conformable(input4d.Grid() ,this->GaugeGrid());
   FermionField tmp(this->FermionGrid()); tmp=Zero();
-  InsertSlice(input4d, tmp, Ls-1, Ls-1);
+  InsertSlice(input4d, tmp, Ls-1, 0);
   tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
   this->Dminus(tmp,imported5d);
 }

From 368d649c8acb8e7f6c90abd097c6a862cfbfb318 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 23 Oct 2024 14:47:55 -0400
Subject: [PATCH 39/50] feature/deprecate-uvm happier -- preallocate device
 resident neighbour table

---
 Grid/stencil/Stencil.h | 84 ++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 39 deletions(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 7ace084c..917a10be 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -121,17 +121,22 @@ class CartesianStencilAccelerator {
   StencilVector same_node;
   Coordinate _simd_layout;
   Parameters parameters;
+  ViewMode mode;
   StencilEntry* _entries_p;
+  StencilEntry* _entries_host_p;
   cobj* u_recv_buf_p;
   cobj* u_send_buf_p;
   accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }
-  accelerator_inline int GetNodeLocal(int osite,int point) const {
-    return this->_entries_p[point+this->_npoints*osite]._is_local;
+  // Not a device function
+  inline int GetNodeLocal(int osite,int point) const {
+    StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
+    return SE._is_local;
   }
   accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
-    ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
+    ptype = this->_permute_type[point];
+    return & this->_entries_p[point+this->_npoints*osite];
   }
   accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
@@ -164,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
   CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
-    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
-      cpu_ptr(this->_entries_p),
-      mode(_mode)
+    : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
   {
-    this->_entries_p =(StencilEntry *)
-      MemoryManager::ViewOpen(this->_entries_p,
-                              this->_npoints*this->_osites*sizeof(StencilEntry),
-                              mode,
-                              AdviseDefault);
+    this->ViewOpen(_mode);
+  }
+  void ViewOpen(ViewMode _mode)
+  {
+    this->mode = _mode;
   }
-  void ViewClose(void)
-  {
-    MemoryManager::ViewClose(this->cpu_ptr,this->mode);
-  }
+  void ViewClose(void) { }
 };
@@ -274,8 +273,8 @@ public:
   std::vector<std::vector<std::pair<int,int> > > face_table ;
   deviceVector<int> surface_list;
-  std::vector<StencilEntry>  _entries;        // Resident in host memory
-  deviceVector<StencilEntry> _entries_device; // Resident in device memory
+  std::vector<StencilEntry>  _entries;        // Resident in host memory
+  deviceVector<StencilEntry> _entries_device; // Resident in device memory
   std::vector<Packet> Packets;
   std::vector<Merge> Mergers;
   std::vector<Merge> MergersSHM;
@@ -626,10 +625,10 @@ public:
   ////////////////////////////////////////
   void PrecomputeByteOffsets(void){
     for(int i=0;i<_entries.size();i++){
-      if( _entries[i]._is_local ) {
-        _entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
+      if( this->_entries[i]._is_local ) {
+        this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
       } else {
-        _entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
+        this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
       }
     }
   };
@@ -764,7 +763,13 @@
     this->_osites  = _grid->oSites();
     _entries.resize(this->_npoints* this->_osites);
-    this->_entries_p = &_entries[0];
+    _entries_device.resize(this->_npoints* this->_osites);
+    this->_entries_host_p = &_entries[0];
+    this->_entries_p = &_entries_device[0];
+
+    std::cout << GridLogMessage << " Stencil object allocated for "<< this->_osites
+              <<" sites table "<< this->_entries_p<< " GridPtr "<<_grid<<std::endl;
     u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
   }
   PrecomputeByteOffsets();
+  acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
 }
 void Local     (int point, int dimension,int shiftpm,int cbmask)
@@ -996,10 +1002,10 @@
     for(int n=0;n<_grid->_slice_nblock[dimension];n++){
       for(int b=0;b<_grid->_slice_block[dimension];b++){
         int idx=point+(lo+o+b)*this->_npoints;
-        _entries[idx]._offset =ro+o+b;
-        _entries[idx]._permute=permute;
-        _entries[idx]._is_local=1;
-        _entries[idx]._around_the_world=wrap;
+        this->_entries[idx]._offset =ro+o+b;
+        this->_entries[idx]._permute=permute;
+        this->_entries[idx]._is_local=1;
+        this->_entries[idx]._around_the_world=wrap;
       }
       o +=_grid->_slice_stride[dimension];
     }
@@ -1017,10 +1023,10 @@
       if ( ocb&cbmask ) {
         int idx = point+(lo+o+b)*this->_npoints;
-        _entries[idx]._offset =ro+o+b;
-        _entries[idx]._is_local=1;
-        _entries[idx]._permute=permute;
-        _entries[idx]._around_the_world=wrap;
+        this->_entries[idx]._offset =ro+o+b;
+        this->_entries[idx]._is_local=1;
+        this->_entries[idx]._permute=permute;
+        this->_entries[idx]._around_the_world=wrap;
       }
     }
@@ -1044,10 +1050,10 @@
     for(int n=0;n<_grid->_slice_nblock[dimension];n++){
       for(int b=0;b<_grid->_slice_block[dimension];b++){
         int idx=point+(so+o+b)*this->_npoints;
-        _entries[idx]._offset =offset+(bo++);
-        _entries[idx]._is_local=0;
-        _entries[idx]._permute=0;
-        _entries[idx]._around_the_world=wrap;
+        this->_entries[idx]._offset =offset+(bo++);
+        this->_entries[idx]._is_local=0;
+        this->_entries[idx]._permute=0;
+        this->_entries[idx]._around_the_world=wrap;
       }
       o +=_grid->_slice_stride[dimension];
     }
@@ -1064,10 +1070,10 @@
       int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
       if ( ocb & cbmask ) {
        int idx = point+(so+o+b)*this->_npoints;
-        _entries[idx]._offset =offset+(bo++);
-        _entries[idx]._is_local=0;
-        _entries[idx]._permute =0;
-        _entries[idx]._around_the_world=wrap;
+        this->_entries[idx]._offset =offset+(bo++);
+        this->_entries[idx]._is_local=0;
+        this->_entries[idx]._permute =0;
+        this->_entries[idx]._around_the_world=wrap;
       }
     }
     o +=_grid->_slice_stride[dimension];

From 63abe87f3644b8e84de38182bbd851cb43e5260c Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 23 Oct 2024 14:49:13 -0400
Subject: [PATCH 40/50] Memory manager verbose improvements that were useful
 to track an error

---
 Grid/allocator/MemoryManagerCache.cc | 13 +++++++------
 1 
file changed, 7 insertions(+), 6 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index e5cc0c42..b53e1510 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -6,11 +6,10 @@ NAMESPACE_BEGIN(Grid); #define MAXLINE 512 static char print_buffer [ MAXLINE ]; -#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; -#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; +#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl; +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl; //#define dprintf(...) - //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -168,7 +167,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -183,7 +182,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n", + (uint64_t)AccCache.bytes, + (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -264,7 +265,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", + dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, From d9f430a575ea4d8e2efafd4cf8e4274abe790369 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Oct 2024 14:51:16 -0400 Subject: [PATCH 41/50] Happy GPU --- tests/core/Test_fftf.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index e5b6f75b..be50fd31 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -39,7 +39,8 @@ int main (int argc, char ** argv) std::cout< Date: Wed, 23 Oct 2024 14:52:15 -0400 Subject: [PATCH 42/50] GPU happy --- tests/core/Test_fft_pf.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/core/Test_fft_pf.cc b/tests/core/Test_fft_pf.cc index 84e70a83..d60f37ee 100644 --- a/tests/core/Test_fft_pf.cc +++ b/tests/core/Test_fft_pf.cc @@ -38,7 +38,7 @@ int 
main (int argc, char ** argv) std::cout< HermOp(Dov); ConjugateGradient CG(1.0e-8,10000); CG(HermOp,src5,result5); + std::cout << " Solved by Conjugate Gradient (CGNE)" < Date: Wed, 23 Oct 2024 15:14:16 -0400 Subject: [PATCH 43/50] Verbose reduce --- Grid/algorithms/FFT.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 3c2eb428..dc972537 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -215,7 +215,7 @@ public: else if ( sign == forward ) div = 1.0; else assert(0); - std::cout << "Making FFTW plan" << std::endl; + std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; FFTW_plan p; { FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; @@ -229,7 +229,7 @@ public: } // Barrel shift and collect global pencil - std::cout << "Making pencil" << std::endl; + std::cout << GridLogPerformance<<"Making pencil" << std::endl; Coordinate lcoor(Nd), gcoor(Nd); result = source; int pc = processor_coor[dim]; @@ -251,7 +251,7 @@ public: } } - std::cout << "Looping orthog" << std::endl; + std::cout <::fftw_destroy_plan(p); #endif From eafc150034a3b9eda8a7c4a8a66f1aac9ccf6eb4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 23 Oct 2024 16:46:26 -0400 Subject: [PATCH 44/50] Test fft asserts --- tests/core/Test_fft.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index e60d3555..16ee5a0f 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -39,7 +39,7 @@ int main (int argc, char ** argv) std::cout< Date: Mon, 11 Nov 2024 23:11:11 +0000 Subject: [PATCH 45/50] Dslash testing for reproduce --- Grid/qcd/action/fermion/WilsonFermion5D.h | 3 + Grid/qcd/action/fermion/WilsonKernels.h | 4 + .../WilsonFermion5DImplementation.h | 23 ++ .../WilsonKernelsImplementation.h | 56 +++- tests/Test_dwf_dslash_repro.cc | 239 ++++++++++++++++++ 5 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 tests/Test_dwf_dslash_repro.cc diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index dd83f269..40c1871f 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -119,6 +119,9 @@ public: void DhopOE(const FermionField &in, FermionField &out,int dag); void DhopEO(const FermionField &in, FermionField &out,int dag); + void DhopComms (const FermionField &in, FermionField &out); + void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids); + // add a DhopComm // -- suboptimal interface will presently trigger multiple comms. 
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 2d868c27..ad077dd3 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -57,6 +57,10 @@ public: int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; + static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + uint64_t *ids); + static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index ec3bd94a..92de5a40 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -438,6 +438,29 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int DhopInternal(StencilOdd,UmuEven,in,out,dag); } +template +void WilsonFermion5D::DhopComms(const FermionField &in, FermionField &out) +{ + int dag =0 ; + conformable(in.Grid(),FermionGrid()); // verifies full grid + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + Compressor compressor(dag); + Stencil.HaloExchangeOpt(in,compressor); +} +template +void WilsonFermion5D::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids) +{ + conformable(in.Grid(),FermionGrid()); // verifies full grid + conformable(in.Grid(),out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + int LLs = in.Grid()->_rdimensions[0]; + int Opt = WilsonKernelsStatic::Opt; + Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids); +} + template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 43662b9c..1d0dfb61 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -411,6 +411,46 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #undef LoopBody } +#ifdef GRID_SYCL +extern "C" { + ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void ); + uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void ); + void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value ); +} +#ifdef GRID_SIMT +#define MAKE_ID(A) 
(intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()) +#else +#define MAKE_ID(A) (0) +#endif + +#else + +#define MAKE_ID(A) (0) + +#endif + + +#define KERNEL_CALL_ID(A) \ + const uint64_t NN = Nsite*Ls; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ + const int Nsimd = SiteHalfSpinor::Nsimd(); \ + const int lane=acceleratorSIMTlane(Nsimd); \ + int idx=sF*Nsimd+lane; \ + uint64_t id = MAKE_ID(); \ + ids[idx]=id; \ + }); \ + accelerator_barrier(); #define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ @@ -418,7 +458,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ss; \ int sU = ss/Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ - }); + }); #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); @@ -451,6 +491,8 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ });} + + template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, @@ -485,6 +527,18 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField } assert(0 && " Kernel optimisation case not covered "); } + +template +void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, + int Ls, int Nsite, const FermionField &in, FermionField &out, + uint64_t *ids) +{ + autoView(U_v , U,AcceleratorRead); + autoView(in_v , in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v , st,AcceleratorRead); + KERNEL_CALL_ID(GenericDhopSite); +} template void WilsonKernels::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, diff --git a/tests/Test_dwf_dslash_repro.cc b/tests/Test_dwf_dslash_repro.cc new file mode 100644 index 00000000..1bf813d9 --- /dev/null +++ b/tests/Test_dwf_dslash_repro.cc @@ -0,0 +1,239 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_cg_prec.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +#endif + +typedef LatticeFermionD FermionField; + +int VerifyOnDevice(const FermionField &res, FermionField &ref) +{ + deviceVector Fails(1); + int * Fail = &Fails[0]; + int FailHost=0; + + typedef typename FermionField::vector_object vobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const uint64_t NN = res.Grid()->oSites(); + + acceleratorPut(*Fail,FailHost); + + accelerator_barrier(); + // Inject an error + + int injection=0; + if(getenv("GRID_ERROR_INJECT")) injection=1; + autoView(res_v,res,AcceleratorWrite); + autoView(ref_v,ref,AcceleratorRead); + if ( res.Grid()->ThisRank()== 0 ) + { + if (((random()&0xF)==0)&&injection) { + uint64_t sF = random()%(NN); + int lane=0; + printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank()); + auto vv = acceleratorGet(res_v[sF]); + double *dd = (double *)&vv; + *dd=M_PI; + acceleratorPut(res_v[sF],vv); + } + } + + accelerator_for( sF, NN, vobj::Nsimd(), { +#ifdef GRID_SIMT + { + int blane = acceleratorSIMTlane(vobj::Nsimd()); +#else + for(int blane;blaneoSites(); + + /////////////////////////////// + // Pull back to host + /////////////////////////////// + autoView(res_v,res,CpuRead); + autoView(ref_v,ref,CpuRead); + + std::vector ids_host(NN*Nsimd); + + acceleratorCopyFromDevice(ids,&ids_host[0],NN*Nsimd*sizeof(uint64_t)); + + ////////////////////////////////////////////////////////////// + // Redo check on host and print IDs + ////////////////////////////////////////////////////////////// + + for(int ss=0;ss< NN; ss++){ + int sF = ss; + for(int lane=0;lane>0 )&0xFF; + int slice =(id>>8 )&0xFF; + int eu =(id>>16)&0xFF; + std::cout << GridHostname()<<" miscompare site "< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + LatticeFermionD src(FGrid); random(RNG5,src); + LatticeFermionD junk(FGrid); random(RNG5,junk); + + LatticeFermionD result(FGrid); result=Zero(); + LatticeFermionD ref(FGrid); ref=Zero(); + + SU::HotConfiguration(RNG4,Umu); + + RealD mass=0.1; + RealD M5=1.8; + + DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl; + UGrid->Barrier(); + std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl; + + std::cout << GridLogMessage << "::::::::::::: Starting DWF repro for "<Broadcast(0,(void *)&start,sizeof(start)); + + FlightRecorder::ContinueOnFail = 0; + FlightRecorder::PrintEntireLog = 0; + FlightRecorder::ChecksumComms = 0; + FlightRecorder::ChecksumCommsSend=0; + + if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s); + if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s); + + const uint64_t NN = 
FGrid->oSites()*vComplexD::Nsimd(); + + deviceVector ids_device(NN); + uint64_t *ids = &ids_device[0]; + + + Ddwf.DhopComms(src,ref); + Ddwf.DhopCalc(src,ref,ids); + + Ddwf.DhopComms(src,result); + + int iter=0; + do { + + result=junk; + + Ddwf.DhopCalc(src,result,ids); + + if ( VerifyOnDevice(result, ref) ) { + printf("Node %s Iter %d detected fails\n",GridHostname(),iter); + PrintFails(result,ref,ids); + // std::cout << " Dslash "<Broadcast(0,(void *)&now,sizeof(now)); + } while (now < (start + nsecs) ); + + + Grid_finalize(); +} From 1caf8b0f868e36f2405648efd51c1ddcffe1ae1b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Jan 2025 15:22:37 +0000 Subject: [PATCH 46/50] Rename --- systems/Aurora/benchmarks/gpu_tile.sh | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 systems/Aurora/benchmarks/gpu_tile.sh diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh new file mode 100755 index 00000000..8e485a06 --- /dev/null +++ b/systems/Aurora/benchmarks/gpu_tile.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) +#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); +#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) + +export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ); +export NUMA_MMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 ); +export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) + +export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export NUMAM=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} + +unset EnableWalkerPartition +export EnableImplicitScaling=0 +export ZE_AFFINITY_MASK=$gpu_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " + +if [ $PALS_RANKID = "0" ] +then +# numactl -m $NUMAM -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" + numactl -m $NUMAM -N $NUMAP "$@" +else + numactl -m $NUMAM -N $NUMAP "$@" +fi From 74a4f4394690dc872afb1f93e3d49c97a35f46f0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 28 Jan 2025 15:22:46 +0000 Subject: [PATCH 47/50] Optional host buffer bounce for no CUDA aware MPI --- Grid/communicator/SharedMemoryMPI.cc | 28 +++++++++++++++ Grid/stencil/Stencil.h | 28 ++++++++++++++- configure.ac | 12 +++++++ systems/Aurora/benchmarks/bench1.pbs | 32 ++++++++--------- systems/Aurora/benchmarks/gpu_tile_compact.sh | 34 ------------------- systems/Aurora/config-command | 2 +- systems/Aurora/sourceme.sh | 1 + 7 files changed, 84 insertions(+), 53 deletions(-) delete mode 100755 systems/Aurora/benchmarks/gpu_tile_compact.sh diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index ec6a5003..2642c0bd 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -42,6 +42,11 @@ Author: Christoph Lehner #ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS +#else +#undef NUMA_PLACE_HOSTBUF +#ifdef NUMA_PLACE_HOSTBUF +#include +#endif #endif #include #endif @@ -537,7 +542,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) // Each MPI rank should allocate 
our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI + printf("Host buffer allocate for GPU non-aware MPI\n"); HostCommBuf= malloc(bytes); +#ifdef NUMA_PLACE_HOSTBUF + int numa; + char *numa_name=(char *)getenv("MPI_BUF_NUMA"); + if(numa_name) { + page_size = sysconf(_SC_PAGESIZE); + numa = atoi(numa_name); + unsigned long page_count = bytes/page_size; + std::vector pages(pcount); + std::vector nodes(pcount,numa); + std::vector status(pcount,-1); + for(unsigned long p=0;pStencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. +#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -376,6 +377,23 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } +#else +#warning "Using COPY VIA HOST BUFFERS IN STENCIL" + for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } +#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iHostBufferFreeAll(); +#endif // run any checksums _grid->StencilBarrier(); - // run any checksums for(int i=0;iHostBufferMalloc(Packets[i].xbytes); + Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); + if ( Packets[i].do_send ) { + acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); + } + _grid->StencilSendToRecvFromBegin(MpiReqs, + Packets[i].host_send_buf, + Packets[i].to_rank,Packets[i].do_send, + Packets[i].host_recv_buf, + Packets[i].from_rank,Packets[i].do_recv, + Packets[i].xbytes,Packets[i].rbytes,i); + } + for(int i=0;iHostBufferFreeAll(); +*/ int ncomm =communicator_halo.size(); int commdir=dir%ncomm; @@ -421,28 +444,60 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorHostBufferMalloc(rbytes); + ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + assert(ierr==0); + CommsRequest_t srq; + srq.PacketType = InterNodeRecv; + srq.bytes = rbytes; + srq.req = rrq; + srq.host_buf = host_recv; + srq.device_buf = recv; + list.push_back(srq); +#endif off_node_bytes+=rbytes; - } + } else{ #ifdef NVLINK_GET void *shm = (void *) this->ShmBufferTranslate(from,xmit); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); #endif + } } if (dox) { // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { tag= dir+_processor*32; +#ifdef ACCELERATOR_AWARE_MPI ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); assert(ierr==0); list.push_back(xrq); +#else + std::cout << " send via host bounce "<HostBufferMalloc(xbytes); + acceleratorCopyFromDevice(xmit, host_xmit,xbytes); + ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + assert(ierr==0); + CommsRequest_t srq; + srq.PacketType = InterNodeXmit; + srq.bytes = xbytes; + srq.req = xrq; + srq.host_buf = host_xmit; + srq.device_buf = xmit; + 
list.push_back(srq); +#endif off_node_bytes+=xbytes; } else { #ifndef NVLINK_GET @@ -463,11 +518,25 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector status(nreq); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); assert(ierr==0); list.resize(0); +#else + // Wait individually and immediately copy receives to device + // Promition to Asynch copy and single wait is easy + MPI_Status status; + for(int r=0;rHostBufferFreeAll(); +#endif } void CartesianCommunicator::StencilBarrier(void) { diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index 94e9741e..422be8aa 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -46,8 +46,22 @@ NAMESPACE_BEGIN(Grid); #if defined (GRID_COMMS_MPI3) typedef MPI_Comm Grid_MPI_Comm; +typedef MPI_Request MpiCommsRequest_t; +#ifdef ACCELERATOR_AWARE_MPI typedef MPI_Request CommsRequest_t; +#else +enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv }; +typedef struct { + PacketType_t PacketType; + void *host_buf; + void *device_buf; + unsigned long bytes; + MpiCommsRequest_t req; +} CommsRequest_t; +#endif + #else +typedef int MpiCommsRequest_t; typedef int CommsRequest_t; typedef int Grid_MPI_Comm; #endif diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 2642c0bd..c7668f8b 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -543,7 +543,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// #ifndef ACCELERATOR_AWARE_MPI printf("Host buffer allocate for GPU non-aware MPI\n"); - HostCommBuf= malloc(bytes); + HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host #ifdef NUMA_PLACE_HOSTBUF int numa; char *numa_name=(char *)getenv("MPI_BUF_NUMA"); diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index c7dcbac9..fb533212 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -467,8 +467,8 @@ public: send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); - std::vector fwd_req; - std::vector bwd_req; + std::vector fwd_req; + std::vector bwd_req; int words = buffer_size; int bytes = words * sizeof(vobj); diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index a768f344..2a478d13 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -368,7 +368,6 @@ public: // accelerator_barrier(); // All kernels should ALREADY be complete // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. 
-#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -377,23 +376,6 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } -#else -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferMalloc(Packets[i].xbytes); - Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); - if ( Packets[i].do_send ) { - acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); - } - _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].host_send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].host_recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); - } -#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iHostBufferFreeAll(); -#endif // run any checksums _grid->StencilBarrier(); for(int i=0;i Date: Thu, 30 Jan 2025 16:36:46 +0000 Subject: [PATCH 49/50] Significantly better performance on Aurora without using pipeline mode --- Grid/communicator/Communicator_base.h | 6 + Grid/communicator/Communicator_mpi3.cc | 282 +++++++++++++----- Grid/communicator/Communicator_none.cc | 9 + Grid/communicator/SharedMemoryMPI.cc | 19 +- .../WilsonFermion5DImplementation.h | 28 +- Grid/stencil/Stencil.h | 8 + Grid/threads/Accelerator.h | 13 +- Makefile.am | 2 +- configure.ac | 5 +- systems/Aurora/benchmarks/bench2.pbs | 16 +- systems/Aurora/benchmarks/gpu_tile.sh | 14 +- systems/Aurora/config-command | 5 +- 12 files changed, 306 insertions(+), 101 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 0da7dc22..85659b3d 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -186,6 +186,12 @@ public: int recv_from_rank,int do_recv, int bytes,int dir); + double StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int xmit_to_rank,int do_xmit, + void *recv, + int recv_from_rank,int do_recv, + int xbytes,int rbytes,int dir); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank,int do_xmit, diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index d269f933..6b6c9dec 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -391,42 +391,131 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int bytes,int dir) { std::vector list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); + offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } -#undef NVLINK_GET // Define to use get instead of put DMA -double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + +#ifdef ACCELERATOR_AWARE_MPI +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ + return 0.0; // Do nothing -- no preparation required +} +double CartesianCommunicator::StencilSendToRecvFromBegin(int list_idx, + std::vector &list, void *xmit, int dest,int dox, void *recv, int from,int dor, int xbytes,int rbytes,int dir) +{ + int ncomm =communicator_halo.size(); + int 
commdir=dir%ncomm; + + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + int gdest = ShmRanks[dest]; + int gfrom = ShmRanks[from]; + int gme = ShmRanks[_processor]; + + assert(dest != _processor); + assert(from != _processor); + assert(gme == ShmRank); + double off_node_bytes=0.0; + int tag; + + if ( dor ) { + if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+from*32; + ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); + assert(ierr==0); + list.push_back(rrq); + off_node_bytes+=rbytes; + } + } + + if (dox) { + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+_processor*32; + ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + assert(ierr==0); + list.push_back(xrq); + off_node_bytes+=xbytes; + } else { + void *shm = (void *) this->ShmBufferTranslate(dest,recv); + assert(shm!=NULL); + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); + } + } + return off_node_bytes; +} + +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) +{ + int nreq=list.size(); + + acceleratorCopySynchronise(); + + if (nreq==0) return; + std::vector status(nreq); + int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + assert(ierr==0); + list.resize(0); +} + +#else /* NOT ... ACCELERATOR_AWARE_MPI */ +/////////////////////////////////////////// +// Pipeline mode through host memory +/////////////////////////////////////////// + /* + * In prepare (phase 1): + * PHASE 1: (prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + * - post device - device transfers + * PHASE 2: (Begin) + * - complete all copies + * - post MPI send asynch + * PHASE 3: (Complete) + * - MPI_waitall + * - host-device transfers + * + ********************************* + * NB could split this further: + *-------------------------------- + * PHASE 1: (Prepare) + * - post MPI receive buffers asynch + * - post device - host send buffer transfer asynch + * PHASE 2: (BeginInterNode) + * - complete all copies + * - post MPI send asynch + * PHASE 3: (BeginIntraNode) + * - post device - device transfers + * PHASE 4: (Complete) + * - MPI_waitall + * - host-device transfers asynch + * - (complete all copies) + */ +double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) { /* * Bring sequence from Stencil.h down to lower level. 
* Assume using XeLink is ok -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - // Introduce a host buffer with a cheap slab allocator and zero cost wipe all - Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes); - Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); - if ( Packets[i].do_send ) { - acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); - } - _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].host_send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].host_recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); - } - for(int i=0;iHostBufferFreeAll(); -*/ + */ int ncomm =communicator_halo.size(); int commdir=dir%ncomm; @@ -447,14 +536,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorHostBufferMalloc(rbytes); ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); assert(ierr==0); @@ -465,79 +555,137 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(from,xmit); - assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); -#endif } } if (dox) { - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { tag= dir+_processor*32; -#ifdef ACCELERATOR_AWARE_MPI - ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); - assert(ierr==0); - list.push_back(xrq); -#else - std::cout << " send via host bounce "<HostBufferMalloc(xbytes); - acceleratorCopyFromDevice(xmit, host_xmit,xbytes); - ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); - assert(ierr==0); + acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch + + // ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + // assert(ierr==0); + // off_node_bytes+=xbytes; + CommsRequest_t srq; srq.PacketType = InterNodeXmit; srq.bytes = xbytes; - srq.req = xrq; + // srq.req = xrq; srq.host_buf = host_xmit; srq.device_buf = xmit; list.push_back(srq); -#endif - off_node_bytes+=xbytes; + } else { -#ifndef NVLINK_GET void *shm = (void *) this->ShmBufferTranslate(dest,recv); assert(shm!=NULL); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); -#endif - } } return off_node_bytes; } + +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int dest,int dox, + void *recv, + int from,int dor, + int xbytes,int rbytes,int dir) +{ + int ncomm =communicator_halo.size(); + int commdir=dir%ncomm; + + MPI_Request xrq; + MPI_Request rrq; + + int ierr; + int gdest = ShmRanks[dest]; + int gfrom = ShmRanks[from]; + int gme = ShmRanks[_processor]; + + assert(dest != _processor); + assert(from != _processor); + assert(gme == ShmRank); + double off_node_bytes=0.0; + int tag; + + void * host_xmit = NULL; + + //////////////////////////////// + // Receives already posted + // Copies already started + //////////////////////////////// + /* + * PHASE 2: (Begin) + * - complete all copies + * - post MPI send asynch + */ + acceleratorCopySynchronise(); + + static int printed; + if(!printed && this->IsBoss() ) { + printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes); + printed=1; + } + + if (dox) { + + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+_processor*32; + // Find the send in the prepared list + int list_idx=-1; + for(int idx = 0; idx &list,int dir) { int nreq=list.size(); - 
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int dest,int dox,
+							 void *recv,
+							 int from,int dor,
+							 int xbytes,int rbytes,int dir)
+{
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+
+  MPI_Request xrq;
+  MPI_Request rrq;
+
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+
+  assert(dest != _processor);
+  assert(from != _processor);
+  assert(gme == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+
+  void * host_xmit = NULL;
+
+  ////////////////////////////////
+  // Receives already posted
+  // Copies already started
+  ////////////////////////////////
+  /*
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   */
+  acceleratorCopySynchronise();
+
+  static int printed;
+  if(!printed && this->IsBoss() ) {
+    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+    printed=1;
+  }
+
+  if (dox) {
+
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+_processor*32;
+      // Find the send in the prepared list
+      int list_idx=-1;
+      for(int idx = 0; idx<list.size(); idx++){
@@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();
-
   acceleratorCopySynchronise();
-
   if (nreq==0) return;
-#ifdef ACCELERATOR_AWARE_MPI
   std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
-#else
-  // Wait individually and immediately copy receives to device
-  // Promition to Asynch copy and single wait is easy
-  MPI_Status status;
+  std::vector<MPI_Request> MpiRequests(nreq);
+
+  for(int r=0;r<nreq;r++){
-  this->HostBufferFreeAll();
-#endif
+
+  acceleratorCopySynchronise(); // Complete all pending copy transfers
+  list.resize(0);               // Delete the list
+  this->HostBufferFreeAll();    // Clean up the buffer allocs
 }
+#endif
+////////////////////////////////////////////
+// END PIPELINE MODE / NO CUDA AWARE MPI
+////////////////////////////////////////////
+
 void CartesianCommunicator::StencilBarrier(void)
 {
   MPI_Barrier  (ShmComm);
diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc
index 7e7dfac8..8e6206ef 100644
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -132,6 +132,15 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
   return 2.0*bytes;
 }
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int xmit_to_rank,int dox,
+							   void *recv,
+							   int recv_from_rank,int dor,
+							   int xbytes,int rbytes, int dir)
+{
+  return xbytes+rbytes;
+}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							  void *xmit,
							  int xmit_to_rank,int dox,
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index c7668f8b..ce11714f 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -43,8 +43,8 @@ Author: Christoph Lehner
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
 #else
-#undef NUMA_PLACE_HOSTBUF
-#ifdef NUMA_PLACE_HOSTBUF
+#ifdef HAVE_NUMAIF_H
+  #warning " Using NUMAIF "
 #include <numaif.h>
 #endif
 #endif
@@ -544,18 +544,19 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifndef ACCELERATOR_AWARE_MPI
   printf("Host buffer allocate for GPU non-aware MPI\n");
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#ifdef NUMA_PLACE_HOSTBUF
+#ifdef HAVE_NUMAIF_H
+  #warning "Moving host buffers to specific NUMA domain"
   int numa;
   char *numa_name=(char *)getenv("MPI_BUF_NUMA");
   if(numa_name) {
-    page_size = sysconf(_SC_PAGESIZE);
+    unsigned long page_size = sysconf(_SC_PAGESIZE);
     numa = atoi(numa_name);
     unsigned long page_count = bytes/page_size;
-    std::vector<void *> pages(pcount);
-    std::vector<int>    nodes(pcount,numa);
-    std::vector<int>    status(pcount,-1);
+    std::vector<void *> pages(page_count);
+    std::vector<int>    nodes(page_count,numa);
+    std::vector<int>    status(page_count,-1);
     for(unsigned long p=0;p<page_count;p++){
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
   // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
   std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
-  st.CommunicateBegin(requests);
-
+#ifndef GRID_ACCELERATED
   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
-  {
-    // std::cout << " WilsonFermion5D Comms merge " <<std::endl;
@@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
     GRID_TRACE("DhopInterior");
     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   }
-
+
+#ifdef GRID_ACCELERATED
+  /////////////////////////////
+  // Overlap with comms -- on GPU the interior kernel call is nonblocking
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+#endif
+
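+  // For reference, the intended ordering on an accelerator (the calls above
+  // and the completion below already implement it; nothing new is added here):
+  //   1. interior DhopKernel is launched       (returns immediately on GPU)
+  //   2. st.CommunicateBegin(requests)         (MPI progresses while it runs)
+  //   3. st.CommsMergeSHM(compressor)          (intra-node faces)
+  //   4. st.CommunicateComplete(requests), then the exterior kernel
+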
   /////////////////////////////
   // Complete comms
   /////////////////////////////
   // std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ public:
     _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
     // But the HaloGather had a barrier too.
+    for(int i=0;i<Packets.size();i++){
+      _grid->StencilSendToRecvFromPrepare(MpiReqs,
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,Packets[i].do_send,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,Packets[i].do_recv,
+					  Packets[i].xbytes,Packets[i].rbytes,i);
+    }
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
					 Packets[i].send_buf,
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index dc68fd2d..2862d087 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -327,7 +327,10 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
 inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
+
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
+inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
+inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@@ -465,8 +468,7 @@ inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
-//inline void acceleratorCopySynchronise(void) { }
+
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
 
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
@@ -483,6 +485,13 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
 
 #endif
 
+inline void acceleratorPin(void *ptr,unsigned long bytes)
+{
+#ifdef GRID_SYCL
+  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
+#endif
+}
+
 //////////////////////////////////////////////
 // Common on all GPU targets
 //////////////////////////////////////////////
diff --git a/Makefile.am b/Makefile.am
index d2a1a326..9addcbf5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,5 +1,5 @@
 # additional include paths necessary to compile the C++ library
-SUBDIRS = Grid HMC benchmarks tests examples
+SUBDIRS = Grid benchmarks tests examples HMC
 
 include $(top_srcdir)/doxygen.inc
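The three Asynch copy entry points added to Accelerator.h above all enqueue on the shared theCopyAccelerator queue, so a single acceleratorCopySynchronise() fences every copy issued since the last wait. A minimal sketch of that staging discipline, assuming only the calls this patch defines (stage_faces and its arguments are hypothetical, not part of the patch):

  // stage_faces is a hypothetical helper; dev[], host[] and nbytes[] are set
  // up by the caller for nfaces halo faces.
  inline void stage_faces(void *dev[],void *host[],size_t nbytes[],int nfaces)
  {
    for(int f=0;f<nfaces;f++){
      acceleratorCopyFromDeviceAsynch(dev[f],host[f],nbytes[f]); // enqueue D->H
    }
    acceleratorCopySynchronise(); // one wait fences all queued copies
    // the host[] buffers are now safe to hand to MPI_Isend
  }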
diff --git a/configure.ac b/configure.ac
index 0b71b834..e4b553bf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include ]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include ]])
@@ -245,9 +246,11 @@ AC_ARG_ENABLE([accelerator-aware-mpi],
   [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
   [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
 
+# Force accelerator CSHIFT now
+AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
+
 case ${ac_ACCELERATOR_AWARE_MPI} in
   yes)
-    AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
     AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
   *);;
 esac
diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs
index 81b3128d..aebed04e 100644
--- a/systems/Aurora/benchmarks/bench2.pbs
+++ b/systems/Aurora/benchmarks/bench2.pbs
@@ -27,10 +27,22 @@ export MPICH_OFI_NIC_POLICY=GPU
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 
+#
+# Local vol 16.16.16.32
+#
+
+#VOL=32.64.64.96
+
+for VOL in 32.32.32.96 32.64.64.96
+do
+for AT in 32
+do
 CMD="mpiexec -np 24 -ppn 12 -envall \
-	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
-	     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "
+	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
+	     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
 
 echo $CMD
 $CMD
+done
+done
diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh
index ddb25c5e..ef64299c 100755
--- a/systems/Aurora/benchmarks/gpu_tile.sh
+++ b/systems/Aurora/benchmarks/gpu_tile.sh
@@ -5,11 +5,11 @@
 #export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
 
 export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
-export NUMA_MMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
+export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
 export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
 
 export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NUMAM=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
+export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
 
 unset EnableWalkerPartition
@@ -19,17 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
+export MPI_BUF_NUMA=$NUMAH
+
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
 
 if [ $PALS_RANKID = "0" ]
 then
-  numactl -m $NUMAM -N $NUMAP  unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
-#  numactl -m $NUMAM -N $NUMAP "$@"
+  numactl -p $NUMAP -N $NUMAP  unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+#  numactl -p $NUMAP -N $NUMAP "$@"
 else
-  numactl -m $NUMAM -N $NUMAP "$@"
+  numactl -p $NUMAP -N $NUMAP "$@"
 fi
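The MPI_BUF_NUMA variable exported above is consumed by the SharedMemoryMPI.cc fragment earlier in this series, which move_pages the host bounce buffer onto the chosen NUMA domain. For reference, a standalone sketch of the same placement (a hypothetical test program, not part of the patch; assumes numaif.h is available and links with -lnuma):

  // numa_place.cc -- hypothetical demo; build: g++ -O2 numa_place.cc -lnuma
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  #include <unistd.h>
  #include <vector>
  #include <numaif.h>

  int main(void)
  {
    unsigned long page_size = sysconf(_SC_PAGESIZE);
    size_t bytes = 64*page_size;                  // small demo buffer
    void *buf = NULL;
    if ( posix_memalign(&buf,page_size,bytes) ) return 1;
    memset(buf,0,bytes);                          // fault the pages in first

    char *numa_name = getenv("MPI_BUF_NUMA");     // same variable gpu_tile.sh exports
    if ( numa_name ) {
      int numa = atoi(numa_name);
      unsigned long page_count = bytes/page_size;
      std::vector<void *> pages(page_count);
      std::vector<int>    nodes(page_count,numa);
      std::vector<int>    status(page_count,-1);
      for(unsigned long p=0;p<page_count;p++) pages[p] = (char *)buf + p*page_size;
      long rc = move_pages(0,page_count,&pages[0],&nodes[0],&status[0],MPOL_MF_MOVE);
      printf("move_pages to node %d returned %ld ; first page now on node %d\n",numa,rc,status[0]);
    }
    free(buf);
    return 0;
  }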
diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
index 64bef44b..6e5512ff 100644
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -1,6 +1,7 @@
 #Ahead of time compile for PVC
-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
+
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
 
 #JIT compile
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "

From 8cf809e231e1545e4834765af3da2f2bbb6bd11f Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 31 Jan 2025 16:14:45 +0000
Subject: [PATCH 50/50] Best results on Aurora so far

---
 Grid/communicator/Communicator_mpi3.cc        | 55 ++++++++++++++-----
 Grid/communicator/SharedMemoryMPI.cc          |  5 ++
 .../WilsonFermion5DImplementation.h           |  6 +-
 Grid/stencil/Stencil.h                        |  2 +-
 Grid/threads/Accelerator.h                    | 28 ++++++++++
 benchmarks/Benchmark_dwf_fp32.cc              |  2 +-
 systems/Aurora/benchmarks/gpu_tile.sh         |  4 +-
 7 files changed, 82 insertions(+), 20 deletions(-)

diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 6b6c9dec..7dc706df 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -408,8 +408,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							  void *xmit,
							  int dest,int dox,
							  void *recv,
							  int from,int dor,
							  int xbytes,int rbytes,int dir)
@@ -470,6 +469,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+  this->StencilBarrier();
 }
 
 #else /* NOT ... ACCELERATOR_AWARE_MPI */
@@ -481,10 +481,10 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
       host_xmit = this->HostBufferMalloc(xbytes);
@@ -577,11 +579,30 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+      host_xmit = this->HostBufferMalloc(xbytes);
+      const int chunks=1;
+      for(int n=0;n<chunks;n++){
-  static int printed;
-  if(!printed && this->IsBoss() ) {
-    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
-    printed=1;
-  }
+  //  static int printed;
+  //  if((printed<8) && this->IsBoss() ) {
+  //    printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
+  //    printed++;
+  //  }
 
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+#ifdef DEVICE_TO_HOST_CONCURRENT
       tag= dir+_processor*32;
       // Find the send in the prepared list
       int list_idx=-1;
@@ -652,7 +673,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+    } else {
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      assert(shm!=NULL);
+      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+    }
   }
   return off_node_bytes;
 }
@@ -680,6 +706,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
   this->HostBufferFreeAll(); // Clean up the buffer allocs
+  this->StencilBarrier();
 }
 #endif
 ////////////////////////////////////////////
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index ce11714f..dc22aee0 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -543,6 +543,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   printf("Host buffer allocate for GPU non-aware MPI\n");
+#if 0
+  HostCommBuf= acceleratorAllocHost(bytes);
+#else
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
 #ifdef HAVE_NUMAIF_H
   #warning "Moving host buffers to specific NUMA domain"
@@ -569,6 +572,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   }
 #endif
   acceleratorPin(HostCommBuf,bytes);
+#endif
+
 #endif
 ShmCommBuf = acceleratorAllocDevice(bytes);
 if (ShmCommBuf == (void *)NULL ) {
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
index 14132cef..3d4e5cc5 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -332,7 +332,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
   // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
   std::vector<std::vector<CommsRequest_t> > requests;
-#ifndef GRID_ACCELERATED
+
+#if 1
   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
@@ -352,7 +353,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   }
 
-#ifdef GRID_ACCELERATED
+  //ifdef GRID_ACCELERATED
+#if 0
   /////////////////////////////
   // Overlap with comms -- on GPU the interior kernel call is nonblocking
   /////////////////////////////
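The #if 0 block in SharedMemoryMPI.cc above sketches the eventual switch from malloc to acceleratorAllocHost, whose per-backend definitions appear in the Accelerator.h hunk below. Pinned allocation lets the asynch device-host copies run as true DMA and spares MPI an internal staging copy. A minimal usage sketch (the alloc/copy names are the ones this patch adds; dev, bytes and the commented MPI arguments are placeholders):

  // send_face_pinned is a hypothetical illustration, not part of the patch.
  void send_face_pinned(void *dev,size_t bytes)
  {
    void *host = acceleratorAllocHost(bytes);        // pinned where supported
    acceleratorCopyFromDeviceAsynch(dev,host,bytes); // D->H can run as true DMA
    acceleratorCopySynchronise();                    // fence before handing to MPI
    // MPI_Isend(host,bytes,MPI_CHAR,dest,tag,comm,&req); // send from pinned buffer
    acceleratorFreeHost(host);  // Grid instead pools these via HostBufferMalloc/FreeAll
  }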
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 2de50e9c..1142891a 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -376,6 +376,7 @@ public:
 					  Packets[i].xbytes,Packets[i].rbytes,i);
     }
+    acceleratorCopySynchronise();
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
					 Packets[i].send_buf,
@@ -401,7 +402,6 @@ public:
     else DslashLogFull();
     // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     // accelerator_barrier();
-    _grid->StencilBarrier();
     for(int i=0;i<Packets.size();i++){
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
+inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
+inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
@@ -441,6 +456,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
   } \
 }
 
+inline void *acceleratorAllocHost(size_t bytes)
+{
+  void *ptr=NULL;
+  auto err = hipMallocHost((void **)&ptr,bytes);
+  if( err != hipSuccess ) {
+    ptr = (void *) NULL;
+    fprintf(stderr," hipMallocHost failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
+  }
+  return ptr;
+};
 inline void *acceleratorAllocShared(size_t bytes)
 {
   void *ptr=NULL;
@@ -464,6 +489,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
   return ptr;
 };
+inline void acceleratorFreeHost(void *ptr){ auto discard=hipHostFree(ptr);}; // pinned host memory is released with hipHostFree, not hipFree
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
@@ -546,8 +572,10 @@ inline void acceleratorCopySynchronise(void) {};
 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
 #ifdef HAVE_MM_MALLOC_H
+inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
 inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
+inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
 inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
 #else
diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc
index ce4fcfab..cbe1ee23 100644
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
 
   int threads = GridThread::GetThreads();
 
-  int Ls=16;
+  int Ls=8;
   for(int i=0;i<argc;i++)
     if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
diff --git a/systems/Aurora/benchmarks/gpu_tile.sh b/systems/Aurora/benchmarks/gpu_tile.sh
index ef64299c..a622ba3e 100755
--- a/systems/Aurora/benchmarks/gpu_tile.sh
+++ b/systems/Aurora/benchmarks/gpu_tile.sh
@@ -19,12 +19,12 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 
-export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:7
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
 
-export MPI_BUF_NUMA=$NUMAH
+#export MPI_BUF_NUMA=$NUMAH
 
 echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
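Taken together, the acceleratorAllocHost/acceleratorFreeHost pair introduced above gives one portable spelling for pinned (or, on the plain CPU target, merely aligned) host staging memory across the SYCL, HIP and CPU backends. A hypothetical smoke test, assuming a working Grid build against this branch (the 2 MB size is arbitrary):

  // alloc_host_smoke.cc -- hypothetical check, not part of the patch series.
  #include <cassert>
  #include <cstring>
  #include <Grid/Grid.h>
  using namespace Grid;

  int main(int argc,char **argv)
  {
    Grid_init(&argc,&argv);
    size_t bytes = 2*1024*1024;                 // arbitrary test size
    void *host = acceleratorAllocHost(bytes);   // pinned where the backend supports it
    assert(host!=NULL);
    memset(host,0,bytes);                       // host memory, so plain memset is fine
    acceleratorFreeHost(host);
    Grid_finalize();
    return 0;
  }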