From 7d077fe4930a678d04a3f1330470246719a60735 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 9 Nov 2023 13:58:44 -0500 Subject: [PATCH 1/7] Frontier compiel --- HMC/HMC2p1f_3GeV.cc | 226 ++++++++++++++++++++++++++++++++ systems/Frontier/config-command | 23 ++++ systems/Frontier/sourceme.sh | 13 ++ 3 files changed, 262 insertions(+) create mode 100644 HMC/HMC2p1f_3GeV.cc create mode 100644 systems/Frontier/config-command create mode 100644 systems/Frontier/sourceme.sh diff --git a/HMC/HMC2p1f_3GeV.cc b/HMC/HMC2p1f_3GeV.cc new file mode 100644 index 00000000..4bf088d7 --- /dev/null +++ b/HMC/HMC2p1f_3GeV.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Copyright (C) 2023 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +using namespace Grid; + +int main(int argc, char **argv) +{ + std::cout << std::setprecision(12); + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + // here make a routine to print all the relevant information on the run + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionD FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 24; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 0; + HMCparams.Trajectories = 200; + HMCparams.NoMetropolisUntil= 20; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_EODWF_lat"; + CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; + CPparams.rng_prefix = "ckpoint_EODWF_rng"; + CPparams.saveInterval = 1; + CPparams.saveSmeared = true; + CPparams.format = 
"IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + + ////////////////////////////////////////////// + + const int Ls = 12; + Real beta = 2.37; + Real light_mass = 0.0047; + Real strange_mass = 0.0186; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD b = 1.0; // Scale factor one, Shamir + RealD c = 0.0; + + OneFlavourRationalParams OFRp; + OFRp.lo = 1.0e-2; + OFRp.hi = 64; + OFRp.MaxIter = 10000; + OFRp.tolerance= 1.0e-10; + OFRp.degree = 14; + OFRp.precision= 40; + + std::vector hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeField Uhot(GridPtr); + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + bool ApplySmearing = false; + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, 
strange_mass, pv_mass, -1.0, 1, M5, b, c); + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + CG, + CG, CG, + CG, CG, + OFRp, false); + + EOFA.is_smeared = ApplySmearing; + Level1.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + } + + for(int h=0;his_smeared = ApplySmearing; + Level1.push_back(Quotients[h]); + } + + ///////////////////////////////////////////////////////////// + // lnDetJacobianAction + ///////////////////////////////////////////////////////////// + double rho = 0.1; // smearing parameter + int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd + int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd + Smear_Stout Stout(rho); + SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); + JacobianAction Jacobian(&SmearingPolicy); + if( ApplySmearing ) Level1.push_back(&Jacobian); + std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; + + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + GaugeAction.is_smeared = ApplySmearing; + Level2.push_back(&GaugeAction); + + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << std::endl; + + + std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; + 
TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + TheHMC.Run(SmearingPolicy); // for smearing + + Grid_finalize(); +} // main + + + diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command new file mode 100644 index 00000000..60ff464c --- /dev/null +++ b/systems/Frontier/config-command @@ -0,0 +1,23 @@ +CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` +../../configure --enable-comms=mpi-auto \ +--with-lime=$CLIME \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-tracing=timer \ +--enable-accelerator=hip \ +--enable-gen-simd-width=64 \ +--disable-gparity \ +--disable-fermion-reps \ +--enable-simd=GPU \ +--enable-accelerator-cshift \ +--with-gmp=$OLCF_GMP_ROOT \ +--with-fftw=$FFTW_DIR/.. \ +--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ +--disable-fermion-reps \ +CXX=hipcc MPICXX=mpicxx \ +CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \ + LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " + + + + diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh new file mode 100644 index 00000000..987241b4 --- /dev/null +++ b/systems/Frontier/sourceme.sh @@ -0,0 +1,13 @@ +. 
/autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh +spack load c-lime +#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib +module load emacs +module load PrgEnv-gnu +module load rocm +module load cray-mpich/8.1.23 +module load gmp +module load cray-fftw +module load craype-accel-amd-gfx90a +export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH +#Hack for lib +#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH From b77a9b8947a3f3d871a39f9c9fe836be18f9285d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Nov 2023 14:31:51 -0500 Subject: [PATCH 2/7] SDDC compiles starting --- systems/SDCC-A100/config-command | 17 +++++++++++++++++ systems/SDCC-A100/sourceme.sh | 2 ++ systems/SDCC-ICE/config-command | 14 ++++++++++++++ systems/SDCC-ICE/sourceme.sh | 1 + 4 files changed, 34 insertions(+) create mode 100644 systems/SDCC-A100/config-command create mode 100644 systems/SDCC-A100/sourceme.sh create mode 100644 systems/SDCC-ICE/config-command create mode 100644 systems/SDCC-ICE/sourceme.sh diff --git a/systems/SDCC-A100/config-command b/systems/SDCC-A100/config-command new file mode 100644 index 00000000..cb773e7a --- /dev/null +++ b/systems/SDCC-A100/config-command @@ -0,0 +1,17 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-accelerator=cuda \ +--enable-gen-simd-width=64 \ +--enable-simd=GPU \ +--enable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=nvcc \ +MPICXX=mpicxx \ +LDFLAGS="-cudart shared " \ +CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared" + + + diff --git a/systems/SDCC-A100/sourceme.sh b/systems/SDCC-A100/sourceme.sh new file mode 100644 index 00000000..2aa86b7e --- /dev/null +++ b/systems/SDCC-A100/sourceme.sh @@ -0,0 +1,2 @@ +module load cuda/12.2 +module load openmpi diff --git 
a/systems/SDCC-ICE/config-command b/systems/SDCC-ICE/config-command new file mode 100644 index 00000000..28e560e3 --- /dev/null +++ b/systems/SDCC-ICE/config-command @@ -0,0 +1,14 @@ +../../configure \ +--enable-comms=mpi \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX2 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=mpicxx \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/SDCC-ICE/sourceme.sh b/systems/SDCC-ICE/sourceme.sh new file mode 100644 index 00000000..a620dea5 --- /dev/null +++ b/systems/SDCC-ICE/sourceme.sh @@ -0,0 +1 @@ +module load openmpi From 14643c0aab28c0b78f2cff1718bb454ceacd95f6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 4 Dec 2023 15:45:57 -0500 Subject: [PATCH 3/7] SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) --- benchmarks/Benchmark_dwf_fp32.cc | 9 +++++++ systems/SDCC-A100/bench.slurm | 42 ++++++++++++++++++++++++++++++++ systems/SDCC-A100/config-command | 2 +- systems/SDCC-ICE/bench.slurm | 31 +++++++++++++++++++++++ systems/SDCC-ICE/config-command | 11 ++++++--- systems/SDCC-ICE/sourceme.sh | 1 + 6 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 systems/SDCC-A100/bench.slurm create mode 100644 systems/SDCC-ICE/bench.slurm diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index ae7cabec..37287595 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) GaugeField Umu(UGrid); GaugeField UmuCopy(UGrid); SU::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(Umu); UmuCopy=Umu; std::cout << GridLogMessage << "Random gauge initialised " << std::endl; @@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet) if(( n2e>1.0e-4) ) { std::cout<Barrier(); + std::cout<Barrier(); exit(-1); } assert (n2e< 1.0e-4 ); diff --git a/systems/SDCC-A100/bench.slurm 
b/systems/SDCC-A100/bench.slurm new file mode 100644 index 00000000..04d1e1e2 --- /dev/null +++ b/systems/SDCC-A100/bench.slurm @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --partition csi +#SBATCH --time=00:10:00 +#SBATCH -A csigeneral +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --qos csi +#SBATCH --gres=gpu:4 + +source sourceme.sh + +cat << EOF > select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export CUDA_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec \$* +EOF +chmod +x ./select_gpu + + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_MEMTYPE_CACHE=n + +export OMP_NUM_THREAD=8 +#srun -N1 -n1 nvidia-smi +#srun -N1 -n1 numactl -H > numa.txt +srun -N1 -n1 lstopo A100-topo.pdf + +# 4.35 TF/s +#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + +srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + diff --git a/systems/SDCC-A100/config-command b/systems/SDCC-A100/config-command index cb773e7a..26ad5377 100644 --- a/systems/SDCC-A100/config-command +++ b/systems/SDCC-A100/config-command @@ -5,7 +5,7 @@ --enable-accelerator=cuda \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ ---enable-accelerator-cshift \ +--disable-accelerator-cshift \ --disable-fermion-reps \ --disable-gparity \ CXX=nvcc \ diff --git a/systems/SDCC-ICE/bench.slurm b/systems/SDCC-ICE/bench.slurm new file mode 100644 index 00000000..76beb828 --- /dev/null +++ b/systems/SDCC-ICE/bench.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:20:00 +#SBATCH -A lqcdtest +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=2 
+#SBATCH --qos lqcd + +source sourceme.sh + +export OMP_NUM_THREAD=24 +#srun -N1 -n1 numactl -H > numa.txt +#srun -N1 -n1 lstopo ice-topo.pdf + +cat << EOF > select_socket +#!/bin/bash +export NUM_MAP=(0 1) +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +exec \$* +EOF +chmod +x ./select_socket + +#for vol in 8.8.8.16 8.8.8.32 8.8.8.64 +#for vol in 8.8.16.16 8.8.16.32 8.8.16.64 +for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32 +do +srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out +srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out +done + diff --git a/systems/SDCC-ICE/config-command b/systems/SDCC-ICE/config-command index 28e560e3..bc28c96d 100644 --- a/systems/SDCC-ICE/config-command +++ b/systems/SDCC-ICE/config-command @@ -1,13 +1,18 @@ ../../configure \ ---enable-comms=mpi \ +--enable-debug \ +--enable-comms=mpi-auto \ --enable-unified=yes \ --enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ --enable-accelerator=none \ ---enable-simd=AVX2 \ +--enable-simd=AVX512 \ --disable-accelerator-cshift \ --disable-fermion-reps \ --disable-gparity \ -CXX=mpicxx \ +CXX=clang++ \ +MPICXX=mpicxx \ +LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \ +LIBS=-lhwloc \ CXXFLAGS="-std=c++17" diff --git a/systems/SDCC-ICE/sourceme.sh b/systems/SDCC-ICE/sourceme.sh index a620dea5..6263063c 100644 --- a/systems/SDCC-ICE/sourceme.sh +++ b/systems/SDCC-ICE/sourceme.sh @@ -1 +1,2 @@ +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH module load openmpi From d1d98272638250f6ed579d760af0ca4f267004b0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 Dec 2023 12:11:03 -0500 
Subject: [PATCH 4/7] Integrator logging update --- Grid/qcd/action/ActionBase.h | 16 +++++++++++ Grid/qcd/hmc/integrators/Integrator.h | 39 ++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index d34702c1..8acae81b 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -129,6 +129,22 @@ public: virtual ~Action(){} }; +template +class EmptyAction : public Action +{ + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions + virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action + virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative + + /////////////////////////////// + // Logging + /////////////////////////////// + virtual std::string action_name() { return std::string("Level Force Log"); }; + virtual std::string LogParameters() { return std::string("No parameters");}; +}; + + + NAMESPACE_END(Grid); #endif // ACTION_BASE_H diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 4dd5a634..f3c728fc 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -86,7 +86,8 @@ public: MomentumFilterBase const* MomFilter; const ActionSet as; - + ActionSet LevelForces; + //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default static MomentumFilterBase const* getDefaultMomFilter(){ static MomentumFilterNone filter; @@ -123,7 +124,8 @@ public: void update_P(MomentaField& Mom, Field& U, int level, double ep) { // input U actually not used in the fundamental case // Fundamental updates, include smearing - + + Field level_force(U.Grid()); level_force =Zero(); for (int a = 0; a < as[level].actions.size(); ++a) { double start_full = usecond(); @@ -144,7 +146,10 @@ public: 
MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; @@ -167,6 +172,16 @@ public: } + { + // total force + Real force_abs = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) + Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; + + Real force_max = std::sqrt(maxLocalNorm2(level_force)); + Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR; + LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); + } + // Force from the other representations as[level].apply(update_P_hireps, Representations, Mom, U, ep); @@ -216,6 +231,12 @@ public: //Default the momentum filter to "do-nothing" MomFilter = getDefaultMomFilter(); + + for (int level = 0; level < as.size(); ++level) { + ActionLevel Level; + Level.push_back(new EmptyAction); + LevelForces.push_back(Level); // does it copy by value or reference?? 
+ } }; virtual ~Integrator() {} @@ -237,6 +258,8 @@ public: for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { as[level].actions.at(actionID)->reset_timer(); } + int actionID=0; + LevelForces[level].actions.at(actionID)->reset_timer(); } } void print_timer(void) @@ -298,6 +321,16 @@ public: <<" calls " << as[level].actions.at(actionID)->deriv_num << std::endl; } + int actionID=0; + std::cout << GridLogMessage + << LevelForces[level].actions.at(actionID)->action_name() + <<"["<deriv_max_average() + <<" norm " << LevelForces[level].actions.at(actionID)->deriv_norm_average() + <<" Fdt max " << LevelForces[level].actions.at(actionID)->Fdt_max_average() + <<" Fdt norm " << LevelForces[level].actions.at(actionID)->Fdt_norm_average() + <<" calls " << LevelForces[level].actions.at(actionID)->deriv_num + << std::endl; } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; } From 645e47c1ba526f5695d309dadfe089f68840fb34 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 8 Dec 2023 16:17:56 -0500 Subject: [PATCH 5/7] Config for Ampere Altra ARM --- systems/SDCC-ARM/config-command-mpi | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 systems/SDCC-ARM/config-command-mpi diff --git a/systems/SDCC-ARM/config-command-mpi b/systems/SDCC-ARM/config-command-mpi new file mode 100644 index 00000000..882cfe56 --- /dev/null +++ b/systems/SDCC-ARM/config-command-mpi @@ -0,0 +1,6 @@ +HDF=$HOME/paboyle/install + +LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF +#LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF + + From f48298ad4e58386b6eb4edbf0fe045a353bca6c7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 Dec 2023 20:56:03 -0500 Subject: [PATCH 6/7] Bug 
fix --- Grid/qcd/hmc/integrators/Integrator.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index f3c728fc..385ff986 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -86,6 +86,7 @@ public: MomentumFilterBase const* MomFilter; const ActionSet as; + ActionSet LevelForces; //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default @@ -124,6 +125,8 @@ public: void update_P(MomentaField& Mom, Field& U, int level, double ep) { // input U actually not used in the fundamental case // Fundamental updates, include smearing + + assert(as.size()==LevelForces.size()); Field level_force(U.Grid()); level_force =Zero(); for (int a = 0; a < as[level].actions.size(); ++a) { @@ -233,9 +236,13 @@ public: MomFilter = getDefaultMomFilter(); for (int level = 0; level < as.size(); ++level) { - ActionLevel Level; - Level.push_back(new EmptyAction); - LevelForces.push_back(Level); // does it copy by value or reference?? + int multiplier = as.at(level).multiplier; + ActionLevel * Level = new ActionLevel(multiplier); + Level->push_back(new EmptyAction); + LevelForces.push_back(*Level); + // does it copy by value or reference?? + // - answer it copies by value, BUT the action level contains a reference that is NOT updated. 
+ // Unsafe code in Guido's area } }; @@ -254,12 +261,14 @@ public: void reset_timer(void) { + assert(as.size()==LevelForces.size()); for (int level = 0; level < as.size(); ++level) { for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { as[level].actions.at(actionID)->reset_timer(); } int actionID=0; - LevelForces[level].actions.at(actionID)->reset_timer(); + assert(LevelForces.at(level).actions.size()==1); + LevelForces.at(level).actions.at(actionID)->reset_timer(); } } void print_timer(void) @@ -352,6 +361,13 @@ public: std::cout << as[level].actions.at(actionID)->LogParameters(); } } + std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <action_name() << "] ID: " << actionID << std::endl; + } + } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; } @@ -433,6 +449,7 @@ public: RealD S(Field& U) { // here also U not used + assert(as.size()==LevelForces.size()); std::cout << GridLogIntegrator << "Integrator action\n"; RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom From 2a0d75bac215d5b34e39ce638dc6b2933de13fb5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Dec 2023 23:19:11 +0000 Subject: [PATCH 7/7] Aurora files --- systems/Aurora/benchmarks/bench.pbs | 54 +++++++++ systems/Aurora/benchmarks/bench2.pbs | 107 ++++++++++++++++++ systems/Aurora/benchmarks/gpu_tile_compact.sh | 65 +++++++++++ .../Aurora/benchmarks/gpu_tile_compact4.sh | 60 ++++++++++ systems/Aurora/config-command | 16 +++ systems/Aurora/proxies.sh | 9 ++ systems/Aurora/sourceme.sh | 12 ++ 7 files changed, 323 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench.pbs create mode 100644 systems/Aurora/benchmarks/bench2.pbs create mode 100755 systems/Aurora/benchmarks/gpu_tile_compact.sh create mode 100755 systems/Aurora/benchmarks/gpu_tile_compact4.sh create mode 100644 systems/Aurora/config-command create mode 100644 systems/Aurora/proxies.sh create 
mode 100644 systems/Aurora/sourceme.sh diff --git a/systems/Aurora/benchmarks/bench.pbs b/systems/Aurora/benchmarks/bench.pbs new file mode 100644 index 00000000..a12cde07 --- /dev/null +++ b/systems/Aurora/benchmarks/bench.pbs @@ -0,0 +1,54 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=1 +#PBS -l walltime=01:00:00 +##PBS -A Aurora_Deployment +#PBS -A LatticeQCD_aesp + +HDIR=/home/paboyle/ +#module use /soft/testing/modulefiles/ +#module load intel-UMD23.05.25593.11/23.05.25593.11 +#module load tools/pti-gpu +#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH +#export PATH=$HDIR/tools/bin:$PATH + +export TZ='/usr/share/zoneinfo/US/Central' +export OMP_PROC_BIND=spread +export OMP_NUM_THREADS=3 +unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +echo Jobid: $PBS_JOBID +echo Running on host `hostname` +echo Running on nodes `cat $PBS_NODEFILE` + +echo NODES +cat $PBS_NODEFILE +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS=12 # Number of MPI ranks per node +NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node +NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS + +NTOTRANKS=$(( NNODES * NRANKS )) + +echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}" +echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES" + + +#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \ +# ./gpu_tile_compact.sh \ +# ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \ +# --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD diff --git 
a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs new file mode 100644 index 00000000..6c3384dd --- /dev/null +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -0,0 +1,107 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +HDIR=/home/paboyle/ +#module use /soft/testing/modulefiles/ +#module load intel-UMD23.05.25593.11/23.05.25593.11 +#module load tools/pti-gpu +#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH +#export PATH=$HDIR/tools/bin:$PATH + +export TZ='/usr/share/zoneinfo/US/Central' +export OMP_PROC_BIND=spread +export OMP_NUM_THREADS=3 +unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + + +echo Jobid: $PBS_JOBID +echo Running on host `hostname` +echo Running on nodes `cat $PBS_NODEFILE` + +echo NODES +cat $PBS_NODEFILE +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS=12 # Number of MPI ranks per node +NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node +NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS + +NTOTRANKS=$(( NNODES * NRANKS )) + +echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}" +echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES" + + +CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +#$CMD | tee 1-to-1.comms.hmem0 +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +#$CMD | tee 1-to-1.comms.hmem1 + + +CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 2-to-2.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 2-to-2.comms.hmem0
+
+CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
+        ./gpu_tile_compact.sh \
+        ./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
+        --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 3-to-3.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 3-to-3.comms.hmem0
+
+
+CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
+        ./gpu_tile_compact4.sh \
+        ./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
+        --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+$CMD | tee 4-to-4.comms.hmem1.nic-affinity
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+$CMD | tee 4-to-4.comms.hmem0.nic-affinity
+
+
+CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
+        ./gpu_tile_compact.sh \
+        ./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
+        --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 6-to-6.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 6-to-6.comms.hmem0
+
+
+CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
+        ./gpu_tile_compact.sh \
+        ./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
+        --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+#$CMD | tee 12-to-12.comms.hmem1
+
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
+#$CMD | tee 12-to-12.comms.hmem0
diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh
new file mode 100755
index 00000000..4ea4b113
--- /dev/null
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#
+# Per-rank launch wrapper: looks up the NUMA domain, NIC and GPU tile for this
+# rank in the tables below (indexed by PALS_LOCAL_RANKID), exports the Level
+# Zero / SYCL settings and runs the wrapped command under numactl.
+
+display_help() {
+  # NOTE(review): the example shows a compact formula (rank 1 -> 0.1); the
+  # tables further down place local rank 1 on GPU 1 tile 0 instead.
+  echo " Will map gpu tile to rank in compact and then round-robin fashion"
+  echo " Usage (only work for one node of ATS/PVC):"
+  echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
+  echo
+  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
+  echo " 0 Rank 0.0"
+  echo " 1 Rank 0.1"
+  echo " 2 Rank 1.0"
+  echo " 3 Rank 1.1"
+  echo " 4 Rank 2.0"
+  echo " 5 Rank 2.1"
+  echo " 6 Rank 0.0"
+  echo
+  echo " Hacked together by apl@anl.gov, please contact if bug found"
+  exit 1
+}
+
+# Hard-coded node shape. The commented-out udev probe was meant to give the
+# exact GPU count i915 knows about (only devices with physical presence).
+#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
+num_gpu=6
+num_tile=2
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
+  display_help
+fi
+
+# Placement tables, one entry per local rank (12 entries). Placement is
+# table-driven; num_gpu/num_tile above are not used to compute it.
+export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
+export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
+export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
+export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+
+# Refuse a missing or out-of-table local rank: the lookups would otherwise
+# yield empty strings, i.e. ZE_AFFINITY_MASK="." and a malformed numactl call.
+if ! [[ "${PALS_LOCAL_RANKID:-}" =~ ^[0-9]+$ ]] || [ "$PALS_LOCAL_RANKID" -ge "${#GPU_MAP[@]}" ]; then
+  echo "gpu_tile_compact.sh: PALS_LOCAL_RANKID='${PALS_LOCAL_RANKID:-}' is not a local rank below ${#GPU_MAP[@]}" >&2
+  exit 1
+fi
+
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+
+export GRID_MPICH_NIC_BIND=$NIC
+
+unset EnableWalkerPartition
+export EnableImplicitScaling=0
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
+
+# Run the wrapped command under numactl for node $NUMA; same for every rank.
+numactl -m "$NUMA" -N "$NUMA" "$@"
diff --git a/systems/Aurora/benchmarks/gpu_tile_compact4.sh b/systems/Aurora/benchmarks/gpu_tile_compact4.sh
new file mode 100755
index 00000000..c157b853
--- /dev/null
+++ b/systems/Aurora/benchmarks/gpu_tile_compact4.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+#
+# Per-rank launch wrapper: looks up the NUMA domain, NIC and GPU tile for this
+# rank in the tables below (indexed by PALS_LOCAL_RANKID), exports the Level
+# Zero / SYCL settings and runs the wrapped command under numactl.
+
+display_help() {
+  # NOTE(review): the example shows a compact formula (rank 1 -> 0.1); the
+  # tables further down place local rank 1 on GPU 1 tile 0 instead.
+  echo " Will map gpu tile to rank in compact and then round-robin fashion"
+  echo " Usage (only work for one node of ATS/PVC):"
+  echo " mpiexec --np N gpu_tile_compact4.sh ./a.out"
+  echo
+  echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
+  echo " 0 Rank 0.0"
+  echo " 1 Rank 0.1"
+  echo " 2 Rank 1.0"
+  echo " 3 Rank 1.1"
+  echo " 4 Rank 2.0"
+  echo " 5 Rank 2.1"
+  echo " 6 Rank 0.0"
+  echo
+  echo " Hacked together by apl@anl.gov, please contact if bug found"
+  exit 1
+}
+
+# Hard-coded node shape. The commented-out udev probe was meant to give the
+# exact GPU count i915 knows about (only devices with physical presence).
+#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
+num_gpu=6
+num_tile=2
+
+if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
+  display_help
+fi
+
+# Placement tables, one entry per local rank (8 entries). Placement is
+# table-driven; num_gpu/num_tile above are not used to compute it.
+export NUMA_MAP=(0 0 1 1 0 0 1 1 )
+export NIC_MAP=(0 1 4 5 0 1 4 5 )
+export GPU_MAP=(0 1 3 4 0 1 3 4 )
+export TILE_MAP=(0 0 0 0 1 1 1 1 )
+
+# Refuse a missing or out-of-table local rank: the lookups would otherwise
+# yield empty strings, i.e. ZE_AFFINITY_MASK="." and a malformed numactl call.
+if ! [[ "${PALS_LOCAL_RANKID:-}" =~ ^[0-9]+$ ]] || [ "$PALS_LOCAL_RANKID" -ge "${#GPU_MAP[@]}" ]; then
+  echo "gpu_tile_compact4.sh: PALS_LOCAL_RANKID='${PALS_LOCAL_RANKID:-}' is not a local rank below ${#GPU_MAP[@]}" >&2
+  exit 1
+fi
+
+export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
+export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
+export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
+export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
+
+export GRID_MPICH_NIC_BIND=$NIC
+
+unset EnableWalkerPartition
+export EnableImplicitScaling=0
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
+export ONEAPI_DEVICE_FILTER=gpu,level_zero
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
+
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
+
+# Run the wrapped command under numactl for node $NUMA; same for every rank.
+numactl -m "$NUMA" -N "$NUMA" "$@"
diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
new file mode 100644
index 00000000..e59ef515
--- /dev/null
+++ b/systems/Aurora/config-command
@@ -0,0 +1,19 @@
+# Invokes ../../configure for a SYCL GPU build with icpx.
+# NOTE(review): CXXFLAGS uses $INSTALL, which is not set in this file;
+# presumably it must already be in the environment - confirm.
+TOOLS=$HOME/tools
+../../configure \
+        --enable-simd=GPU \
+        --enable-gen-simd-width=64 \
+        --enable-comms=mpi-auto \
+        --enable-accelerator-cshift \
+        --disable-gparity \
+        --disable-fermion-reps \
+        --enable-shm=nvlink \
+        --enable-accelerator=sycl \
+        --enable-unified=no \
+        MPICXX=mpicxx \
+        CXX=icpx \
+        LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
+        CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+
diff --git a/systems/Aurora/proxies.sh b/systems/Aurora/proxies.sh
new file mode 100644
index 00000000..ff0d5a5b
--- /dev/null
+++ b/systems/Aurora/proxies.sh
@@ -0,0 +1,11 @@
+# Proxy environment for proxy.alcf.anl.gov plus the compute runtime module.
+# Note: the git config line below rewrites the user's global git settings.
+export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
+export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
+export http_proxy=http://proxy.alcf.anl.gov:3128
+export https_proxy=http://proxy.alcf.anl.gov:3128
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+git config --global http.proxy http://proxy.alcf.anl.gov:3128
+module use /soft/modulefiles
+module load intel_compute_runtime/release/agama-devel-682.22
+
diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh
new file mode 100644
index 00000000..0c4e0a6e
--- /dev/null
+++ b/systems/Aurora/sourceme.sh
@@ -0,0 +1,14 @@
+# Loads the compute runtime module and sets the proxy.alcf.anl.gov proxies.
+# Note: the git config line below rewrites the user's global git settings.
+#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
+
+module use /soft/modulefiles
+module load intel_compute_runtime/release/agama-devel-682.22
+
+export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
+export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
+export http_proxy=http://proxy.alcf.anl.gov:3128
+export https_proxy=http://proxy.alcf.anl.gov:3128
+export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
+git config --global http.proxy http://proxy.alcf.anl.gov:3128
+