mirror of https://github.com/paboyle/Grid.git

commit 136d843ce7 (parent 239afb18fb): Crusher updates
@@ -53,6 +53,9 @@ NAMESPACE_BEGIN(Grid);
     MultiShiftFunction PowerQuarter;
     MultiShiftFunction PowerNegQuarter;
 
+    MultiShiftFunction MDPowerNegHalf;
+    MultiShiftFunction MDPowerQuarter;
+
   private:
 
     FermionOperator<Impl> & NumOp;// the basic operator
@@ -81,11 +84,13 @@ NAMESPACE_BEGIN(Grid);
       remez.generateApprox(param.degree,1,2);
       PowerHalf.Init(remez,param.tolerance,false);
       PowerNegHalf.Init(remez,param.tolerance,true);
+      MDPowerNegHalf.Init(remez,param.mdtolerance,true);
 
       // MdagM^(+- 1/4)
       std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
       remez.generateApprox(param.degree,1,4);
       PowerQuarter.Init(remez,param.tolerance,false);
+      MDPowerQuarter.Init(remez,param.mdtolerance,false);
       PowerNegQuarter.Init(remez,param.tolerance,true);
     };
 
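For context: each exponent keeps two MultiShiftFunction initialisations from the same Remez fit. The Power* functions use the tight param.tolerance and drive the heatbath and accept/reject solves, while the new MDPower* functions reuse the fit at the looser param.mdtolerance and are consumed only by the molecular-dynamics force (see the deriv() hunks below). A minimal sketch of that pattern, assuming Grid's AlgRemez(lo,hi,precision) constructor and the Init(remez,tol,inverse) signature used above, where the boolean selects the inverse (negative) power:

    // Sketch only: one Remez fit per exponent, initialised at two tolerances.
    AlgRemez remez(param.lo, param.hi, param.precision);

    remez.generateApprox(param.degree, 1, 2);             // rational fit to x^{1/2} on [lo,hi]
    PowerNegHalf  .Init(remez, param.tolerance,   true);  // x^{-1/2}, heatbath / accept-reject
    MDPowerNegHalf.Init(remez, param.mdtolerance, true);  // x^{-1/2}, MD force (looser shift tolerances)

    remez.generateApprox(param.degree, 1, 4);             // rational fit to x^{1/4}
    PowerQuarter  .Init(remez, param.tolerance,   false); // x^{+1/4}
    MDPowerQuarter.Init(remez, param.mdtolerance, false); // x^{+1/4}, MD force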
@@ -226,8 +231,8 @@ NAMESPACE_BEGIN(Grid);
 
     virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
 
-      const int n_f  = PowerNegHalf.poles.size();
-      const int n_pv = PowerQuarter.poles.size();
+      const int n_f  = MDPowerNegHalf.poles.size();
+      const int n_pv = MDPowerQuarter.poles.size();
 
       std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
       std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
@@ -246,8 +251,8 @@ NAMESPACE_BEGIN(Grid);
       SchurDifferentiableOperator<Impl> VdagV(NumOp);
       SchurDifferentiableOperator<Impl> MdagM(DenOp);
 
-      ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-      ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+      ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,MDPowerQuarter);
+      ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,MDPowerNegHalf);
 
       msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
       msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
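These are the solvers that consume the MD approximations: ConjugateGradientMultiShift takes the MultiShiftFunction whose poles become the CG shifts and whose per-shift tolerances now derive from mdtolerance. Schematically, following the calls above (fragment only, names as in the hunk):

    // Force-time multishift solves, now at the looser MD tolerance.
    ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter, MDPowerQuarter);
    ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter, MDPowerNegHalf);

    // Each call solves (Op + b_k) psi_k = src for every pole b_k of the rational
    // approximation; the vector argument receives the per-pole solutions and the
    // final argument the assembled rational function of Op applied to src.
    msCG_V(VdagV, PhiOdd, MpvPhi_k, MpvPhi);
    msCG_M(MdagM, MpvPhi, MfMpvPhi_k, MfMpvPhi);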
@@ -266,7 +271,7 @@ NAMESPACE_BEGIN(Grid);
 
       //(1)
       for(int k=0;k<n_f;k++){
-        ak = PowerNegHalf.residues[k];
+        ak = MDPowerNegHalf.residues[k];
         MdagM.Mpc(MfMpvPhi_k[k],Y);
         MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y );  dSdU=dSdU+ak*tmp;
         MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] );  dSdU=dSdU+ak*tmp;
@@ -276,7 +281,7 @@ NAMESPACE_BEGIN(Grid);
       //(3)
       for(int k=0;k<n_pv;k++){
 
-        ak = PowerQuarter.residues[k];
+        ak = MDPowerQuarter.residues[k];
 
         VdagV.Mpc(MpvPhi_k[k],Y);
         VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y);   dSdU=dSdU+ak*tmp;
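For reference, loops (1) and (3) implement the standard partial-fraction force, with the residues a_k (the residues[k] read above, now taken from MDPowerNegHalf / MDPowerQuarter) and poles b_k of the rational approximation. Schematically, for a generic pseudofermion source phi,

    \delta\big(\phi^{\dagger} R(M^{\dagger}M)\,\phi\big)
      \;=\; -\sum_{k} a_k\, \psi_k^{\dagger}\,\delta(M^{\dagger}M)\,\psi_k ,
    \qquad \psi_k \equiv (M^{\dagger}M + b_k)^{-1}\phi ,

where the psi_k are exactly the shifted solutions returned by the multishift CG (MfMpvPhi_k, MpvMfMpvPhi_k above), and the MpcDeriv / MpcDagDeriv calls supply the two halves of delta(M^dagger M) with the sign conventions folded in.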
@@ -75,16 +75,14 @@ NAMESPACE_BEGIN(Grid);
       remez.generateApprox(param.degree,1,2);
       PowerHalf.Init(remez,param.tolerance,false);
       PowerNegHalf.Init(remez,param.tolerance,true);
-      MDPowerNegHalf.Init(remez,param.mdtolerance,true);
 
       // MdagM^(+- 1/4)
       std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
       remez.generateApprox(param.degree,1,4);
       PowerQuarter.Init(remez,param.tolerance,false);
       PowerNegQuarter.Init(remez,param.tolerance,true);
 
       // Derive solves different tol
       MDPowerQuarter.Init(remez,param.mdtolerance,false);
       MDPowerNegHalf.Init(remez,param.mdtolerance,true);
-      PowerNegQuarter.Init(remez,param.tolerance,true);
     };
 
     virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
 
@@ -6,6 +6,13 @@ uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
 
+#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
+#define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
+#define ENV_RANK_SLURM         "SLURM_PROCID"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
+
 #ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
 cudaStream_t copyStream;
@@ -17,12 +24,6 @@ void acceleratorInit(void)
 
   char * localRankStr = NULL;
   int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
-#define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
-#define ENV_RANK_SLURM         "SLURM_PROCID"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
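Hoisting the ENV_* macros to file scope lets every accelerator branch (CUDA, HIP, SYCL) share the same rank-detection idiom instead of redefining the macros per branch. A self-contained illustration of that lookup (hypothetical helper name, not part of the patch):

    #include <cstdlib>

    // Hypothetical helper mirroring acceleratorInit(): consult OpenMPI, then
    // MVAPICH, then Slurm; as in the patch, a later variable overrides an
    // earlier one if both happen to be set.
    static int rankFromEnvironment(const char *ompi, const char *mvapich,
                                   const char *slurm, int fallback = 0)
    {
      int rank = fallback;
      if (const char *s = std::getenv(ompi))    rank = std::atoi(s);
      if (const char *s = std::getenv(mvapich)) rank = std::atoi(s);
      if (const char *s = std::getenv(slurm))   rank = std::atoi(s);
      return rank;
    }

    // e.g. world rank:  rankFromEnvironment("OMPI_COMM_WORLD_RANK",
    //                                       "MV2_COMM_WORLD_RANK", "SLURM_PROCID");
    //      local rank:  rankFromEnvironment("OMPI_COMM_WORLD_LOCAL_RANK",
    //                                       "MV2_COMM_WORLD_LOCAL_RANK", "SLURM_LOCALID");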
@@ -119,10 +120,6 @@ void acceleratorInit(void)
 
   char * localRankStr = NULL;
   int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
   // We extract the local rank initialization using an environment variable
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
   {
@@ -134,8 +131,10 @@ void acceleratorInit(void)
   }
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
 
-  printf("world_rank %d has %d devices\n",world_rank,nDevices);
+  if ( world_rank == 0 )
+    printf("world_rank %d has %d devices\n",world_rank,nDevices);
+  size_t totalDeviceMem=0;
   for (int i = 0; i < nDevices; i++) {
 
@@ -208,10 +207,7 @@ void acceleratorInit(void)
 
   char * localRankStr = NULL;
   int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"
+
   // We extract the local rank initialization using an environment variable
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
   {
@@ -102,7 +102,7 @@ int main(int argc, char **argv) {
   SFRp.hi       = 30.0;
   SFRp.MaxIter  = 10000;
   SFRp.tolerance= 1.0e-8;
-  SFRp.mdtolerance= 1.0e-6;
+  SFRp.mdtolerance= 1.0e-5;
   SFRp.degree   = 16;
   SFRp.precision= 50;
   SFRp.BoundsCheckFreq=5;
@@ -112,7 +112,7 @@ int main(int argc, char **argv) {
   OFRp.hi       = 30.0;
   OFRp.MaxIter  = 10000;
   OFRp.tolerance= 1.0e-8;
-  OFRp.mdtolerance= 1.0e-6;
+  OFRp.mdtolerance= 1.0e-5;
   OFRp.degree   = 16;
   OFRp.precision= 50;
   OFRp.BoundsCheckFreq=5;
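Both parameter blocks keep tolerance at 1.0e-8 for the heatbath and bounds-checked accept/reject solves and loosen only mdtolerance, which feeds the MDPower* multishift solves inside the force. An annotated sketch (struct name assumed to be Grid's OneFlavourRationalParams; values as in the hunks above):

    OneFlavourRationalParams OFRp;       // assumed type; fields as used by SFRp/OFRp above
    OFRp.hi              = 30.0;         // upper edge of the Remez fit interval
    OFRp.MaxIter         = 10000;        // multishift CG iteration cap
    OFRp.tolerance       = 1.0e-8;       // heatbath / accept-reject solves
    OFRp.mdtolerance     = 1.0e-5;       // force-time solves only (loosened from 1.0e-6)
    OFRp.degree          = 16;           // degree of the rational approximation
    OFRp.precision       = 50;           // internal precision of the Remez fit
    OFRp.BoundsCheckFreq = 5;            // how often the approximation bounds are checked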
@@ -162,15 +162,17 @@ int main(int argc, char **argv) {
   FermionAction::ImplParams Params(boundary);
 
   double StoppingCondition = 1e-8;
+  double MDStoppingCondition = 1e-6;
   double MaxCGIterations = 30000;
   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);
 
   ////////////////////////////////////
   // Collect actions
   ////////////////////////////////////
   ActionLevel<HMCWrapper::Field> Level1(1);
   ActionLevel<HMCWrapper::Field> Level2(4);
-  ActionLevel<HMCWrapper::Field> Level3(6);
+  ActionLevel<HMCWrapper::Field> Level3(8);
 
   ////////////////////////////////////
   // Strange action
@@ -226,7 +228,7 @@ int main(int argc, char **argv) {
     Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
     if(h!=0) {
-      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
     } else {
       Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
       Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
@@ -241,7 +243,7 @@ int main(int argc, char **argv) {
   for(int h=0;h<nquo-1;h++){
     Level2.push_back(Quotients[h]);
   }
-  Level1.push_back(Quotients[nquo-1]); // PV dirichlet fix on coarse timestep
+  Level2.push_back(Quotients[nquo-1]);
 
   /////////////////////////////////////////////////////////////
   // Gauge action
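For orientation: the integer passed to ActionLevel is the step multiplier of that level relative to the next coarser one, so Level1(1) is the coarsest, most expensive timestep and Level3(8) the finest, which now takes 8 sub-steps instead of 6. A minimal sketch of how these levels are usually handed to the integrator, assuming the TheHMC runner and a GaugeAction object from Grid's standard HMC drivers (neither is shown in this hunk):

    // Sketch only: nested integrator levels as assembled in Grid HMC drivers.
    ActionLevel<HMCWrapper::Field> Level1(1);   // coarsest timestep
    ActionLevel<HMCWrapper::Field> Level2(4);   // 4 sub-steps per Level1 step
    ActionLevel<HMCWrapper::Field> Level3(8);   // 8 sub-steps per Level2 step (was 6)

    Level2.push_back(Quotients[nquo-1]);        // as in the hunk above
    Level3.push_back(&GaugeAction);             // hypothetical: gauge action on the finest level

    TheHMC.TheAction.push_back(Level1);
    TheHMC.TheAction.push_back(Level2);
    TheHMC.TheAction.push_back(Level3);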
@@ -9,6 +9,7 @@
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
-LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
+CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include " \
+LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
 HIPFLAGS = --amdgpu-target=gfx90a
@@ -12,19 +12,21 @@
 #SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
 
 DIR=.
 module list
 source sourceme.sh
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
-export MPICH_SMP_SINGLE_COPY_MODE=NONE
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export OMP_NUM_THREADS=1
 
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 
-PARAMS=" --accelerator-threads 16 --grid 32.32.32.256 --mpi 1.1.1.8 --comms-overlap --shm 2048 --shm-mpi 0"
 echo $PARAMS
 echo working directory
 pwd
 
+PARAMS=" --accelerator-threads 8 --grid 32.32.32.32 --mpi 1.1.1.1 --comms-sequential --shm 2048 --shm-mpi 0"
+srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
 
+PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.1 --comms-sequential --shm 2048 --shm-mpi 0"
 srun --gpus-per-task 1 -n8 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
 
@@ -12,37 +12,30 @@
 #SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
 
 DIR=.
 module list
 source setup.sh
 
 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
 #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
-export MPICH_SMP_SINGLE_COPY_MODE=NONE
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE
 export OMP_NUM_THREADS=1
 
 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
 
-PARAMS=" --accelerator-threads 16 --grid 64.64.64.256 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
+for vol in 64.64.64.256 64.64.64.128 32.32.32.256 32.32.32.128
+do
+PARAMS=" --accelerator-threads 8 --grid $vol --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
 echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.256.8node
+srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.${vol}.8node.shm-mpi1
+done
 
-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 1"
+PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
 echo $PARAMS
-srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node
+srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node
 
-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 0"
+PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
 echo $PARAMS
 #srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node.shm0
 
-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
-echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node
-
-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
-echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0
-
 srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0
 
@@ -1,6 +1,9 @@
 module load PrgEnv-gnu
-module load rocm/4.5.0
+module load rocm/5.1.0
+module load cray-mpich/8.1.15
 module load gmp
-module load cray-fftw
+#module load cray-fftw
 module load craype-accel-amd-gfx90a
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+#Hack for lib
+export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH