1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Merge branch 'develop' into hisq_fat_links

This commit is contained in:
david clarke 2024-01-21 20:21:08 -07:00
commit c020b78e02
20 changed files with 780 additions and 1 deletions

View File

@ -129,6 +129,22 @@ public:
virtual ~Action(){}
};
template <class GaugeField >
class EmptyAction : public Action <GaugeField>
{
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() { return std::string("Level Force Log"); };
virtual std::string LogParameters() { return std::string("No parameters");};
};
NAMESPACE_END(Grid);
#endif // ACTION_BASE_H

View File

@ -87,6 +87,8 @@ public:
const ActionSet<Field, RepresentationPolicy> as;
ActionSet<Field,RepresentationPolicy> LevelForces;
//Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){
static MomentumFilterNone<MomentaField> filter;
@ -124,6 +126,9 @@ public:
// input U actually not used in the fundamental case
// Fundamental updates, include smearing
assert(as.size()==LevelForces.size());
Field level_force(U.Grid()); level_force =Zero();
for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond();
@ -144,7 +149,10 @@ public:
MomFilter->applyFilter(force);
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
// track the total
level_force = level_force+force;
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
@ -167,6 +175,16 @@ public:
}
{
// total force
Real force_abs = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
Real force_max = std::sqrt(maxLocalNorm2(level_force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;
LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
}
// Force from the other representations
as[level].apply(update_P_hireps, Representations, Mom, U, ep);
@ -216,6 +234,16 @@ public:
//Default the momentum filter to "do-nothing"
MomFilter = getDefaultMomFilter();
for (int level = 0; level < as.size(); ++level) {
int multiplier = as.at(level).multiplier;
ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier);
Level->push_back(new EmptyAction<Field>);
LevelForces.push_back(*Level);
// does it copy by value or reference??
// - answer it copies by value, BUT the action level contains a reference that is NOT updated.
// Unsafe code in Guido's area
}
};
virtual ~Integrator() {}
@ -233,10 +261,14 @@ public:
void reset_timer(void)
{
assert(as.size()==LevelForces.size());
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
as[level].actions.at(actionID)->reset_timer();
}
int actionID=0;
assert(LevelForces.at(level).actions.size()==1);
LevelForces.at(level).actions.at(actionID)->reset_timer();
}
}
void print_timer(void)
@ -298,6 +330,16 @@ public:
<<" calls " << as[level].actions.at(actionID)->deriv_num
<< std::endl;
}
int actionID=0;
std::cout << GridLogMessage
<< LevelForces[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] :\n\t\t "
<<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
<<" norm " << LevelForces[level].actions.at(actionID)->deriv_norm_average()
<<" Fdt max " << LevelForces[level].actions.at(actionID)->Fdt_max_average()
<<" Fdt norm " << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
<<" calls " << LevelForces[level].actions.at(actionID)->deriv_num
<< std::endl;
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
@ -319,6 +361,13 @@ public:
std::cout << as[level].actions.at(actionID)->LogParameters();
}
}
std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
for (int level = 0; level < LevelForces.size(); ++level) {
std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
}
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
@ -400,6 +449,7 @@ public:
RealD S(Field& U)
{ // here also U not used
assert(as.size()==LevelForces.size());
std::cout << GridLogIntegrator << "Integrator action\n";
RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom

226
HMC/HMC2p1f_3GeV.cc Normal file
View File

@ -0,0 +1,226 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Copyright (C) 2023
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
using namespace Grid;
int main(int argc, char **argv)
{
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef MobiusFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 24;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 0;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 20;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("HotStart");
HMCparams.StartingType =std::string("ColdStart");
// HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_EODWF_lat";
CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
CPparams.rng_prefix = "ckpoint_EODWF_rng";
CPparams.saveInterval = 1;
CPparams.saveSmeared = true;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
Real beta = 2.37;
Real light_mass = 0.0047;
Real strange_mass = 0.0186;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD b = 1.0; // Scale factor one, Shamir
RealD c = 0.0;
OneFlavourRationalParams OFRp;
OFRp.lo = 1.0e-2;
OFRp.hi = 64;
OFRp.MaxIter = 10000;
OFRp.tolerance= 1.0e-10;
OFRp.degree = 14;
OFRp.precision= 40;
std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeField Uhot(GridPtr);
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
double StoppingCondition = 1e-10;
double MaxCGIterations = 30000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
bool ApplySmearing = false;
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
////////////////////////////////////
// Strange action
////////////////////////////////////
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
CG,
CG, CG,
CG, CG,
OFRp, false);
EOFA.is_smeared = ApplySmearing;
Level1.push_back(&EOFA);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]);
light_num.push_back(hasenbusch[h]);
}
light_num.push_back(pv_mass);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h] << " / " << light_den[h]<< std::endl;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
}
for(int h=0;h<n_hasenbusch+1;h++){
Quotients[h]->is_smeared = ApplySmearing;
Level1.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// lnDetJacobianAction
/////////////////////////////////////////////////////////////
double rho = 0.1; // smearing parameter
int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd
int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
if( ApplySmearing ) Level1.push_back(&Jacobian);
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
GaugeAction.is_smeared = ApplySmearing;
Level2.push_back(&GaugeAction);
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
} // main

View File

@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet)
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1);
}
assert (n2e< 1.0e-4 );

View File

@ -0,0 +1,54 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=1
#PBS -l walltime=01:00:00
##PBS -A Aurora_Deployment
#PBS -A LatticeQCD_aesp
HDIR=/home/paboyle/
#module use /soft/testing/modulefiles/
#module load intel-UMD23.05.25593.11/23.05.25593.11
#module load tools/pti-gpu
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
#export PATH=$HDIR/tools/bin:$PATH
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`
echo NODES
cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
# ./gpu_tile_compact.sh \
# ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
# --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD

View File

@ -0,0 +1,107 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
HDIR=/home/paboyle/
#module use /soft/testing/modulefiles/
#module load intel-UMD23.05.25593.11/23.05.25593.11
#module load tools/pti-gpu
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
#export PATH=$HDIR/tools/bin:$PATH
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`
echo NODES
cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 1-to-1.comms.hmem0
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 1-to-1.comms.hmem1
CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 2-to-2.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 2-to-2.comms.hmem0
CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 3-to-3.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 3-to-3.comms.hmem0
CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact4.sh \
./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
$CMD | tee 4-to-4.comms.hmem1.nic-affinity
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
$CMD | tee 4-to-4.comms.hmem0.nic-affinity
CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 6-to-6.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 6-to-6.comms.hmem0
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 12-to-12.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 12-to-12.comms.hmem0

View File

@ -0,0 +1,65 @@
#!/bin/bash
display_help() {
echo " Will map gpu tile to rank in compact and then round-robin fashion"
echo " Usage (only work for one node of ATS/PVC):"
echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
echo
echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
echo " 0 Rank 0.0"
echo " 1 Rank 0.1"
echo " 2 Rank 1.0"
echo " 3 Rank 1.1"
echo " 4 Rank 2.0"
echo " 5 Rank 2.1"
echo " 6 Rank 0.0"
echo
echo " Hacked together by apl@anl.gov, please contact if bug found"
exit 1
}
#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
num_gpu=6
num_tile=2
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
display_help
fi
gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
tile_id=$((PALS_LOCAL_RANKID % num_tile))
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
export GRID_MPICH_NIC_BIND=$NIC
unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
if [ $PALS_LOCAL_RANKID = 0 ]
then
numactl -m $NUMA -N $NUMA "$@"
else
numactl -m $NUMA -N $NUMA "$@"
fi

View File

@ -0,0 +1,60 @@
#!/bin/bash
display_help() {
echo " Will map gpu tile to rank in compact and then round-robin fashion"
echo " Usage (only work for one node of ATS/PVC):"
echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
echo
echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
echo " 0 Rank 0.0"
echo " 1 Rank 0.1"
echo " 2 Rank 1.0"
echo " 3 Rank 1.1"
echo " 4 Rank 2.0"
echo " 5 Rank 2.1"
echo " 6 Rank 0.0"
echo
echo " Hacked together by apl@anl.gov, please contact if bug found"
exit 1
}
#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
num_gpu=6
num_tile=2
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
display_help
fi
gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
tile_id=$((PALS_LOCAL_RANKID % num_tile))
export NUMA_MAP=(0 0 1 1 0 0 1 1 )
export NIC_MAP=(0 1 4 5 0 1 4 5 )
export GPU_MAP=(0 1 3 4 0 1 3 4 )
export TILE_MAP=(0 0 0 0 1 1 1 1 )
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
export GRID_MPICH_NIC_BIND=$NIC
unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND"
numactl -m $NUMA -N $NUMA "$@"

View File

@ -0,0 +1,16 @@
TOOLS=$HOME/tools
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"

View File

@ -0,0 +1,9 @@
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128
module use /soft/modulefiles
module load intel_compute_runtime/release/agama-devel-682.22

View File

@ -0,0 +1,12 @@
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
module use /soft/modulefiles
module load intel_compute_runtime/release/agama-devel-682.22
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128

View File

@ -0,0 +1,23 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "

View File

@ -0,0 +1,13 @@
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH

View File

@ -0,0 +1,42 @@
#!/bin/bash
#SBATCH --partition csi
#SBATCH --time=00:10:00
#SBATCH -A csigeneral
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --qos csi
#SBATCH --gres=gpu:4
source sourceme.sh
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export CUDA_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec \$*
EOF
chmod +x ./select_gpu
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=no
export UCX_MEMTYPE_CACHE=n
export OMP_NUM_THREAD=8
#srun -N1 -n1 nvidia-smi
#srun -N1 -n1 numactl -H > numa.txt
srun -N1 -n1 lstopo A100-topo.pdf
# 4.35 TF/s
#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16
srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16

View File

@ -0,0 +1,17 @@
../../configure \
--enable-comms=mpi-auto \
--enable-unified=no \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=nvcc \
MPICXX=mpicxx \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"

View File

@ -0,0 +1,2 @@
module load cuda/12.2
module load openmpi

View File

@ -0,0 +1,6 @@
HDF=$HOME/paboyle/install
LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF
#LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF

View File

@ -0,0 +1,31 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:20:00
#SBATCH -A lqcdtest
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --qos lqcd
source sourceme.sh
export OMP_NUM_THREAD=24
#srun -N1 -n1 numactl -H > numa.txt
#srun -N1 -n1 lstopo ice-topo.pdf
cat << EOF > select_socket
#!/bin/bash
export NUM_MAP=(0 1)
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
exec \$*
EOF
chmod +x ./select_socket
#for vol in 8.8.8.16 8.8.8.32 8.8.8.64
#for vol in 8.8.16.16 8.8.16.32 8.8.16.64
for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32
do
srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out
srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out
done

View File

@ -0,0 +1,19 @@
../../configure \
--enable-debug \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \
LIBS=-lhwloc \
CXXFLAGS="-std=c++17"

View File

@ -0,0 +1,2 @@
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH
module load openmpi