Mirror of https://github.com/paboyle/Grid.git (synced 2025-04-09 21:50:45 +01:00)

Commit 136d843ce7 ("Crusher updates"), parent 239afb18fb.
@@ -53,6 +53,9 @@ NAMESPACE_BEGIN(Grid);
     MultiShiftFunction PowerQuarter;
     MultiShiftFunction PowerNegQuarter;

+    MultiShiftFunction MDPowerNegHalf;
+    MultiShiftFunction MDPowerQuarter;
+
   private:

     FermionOperator<Impl> & NumOp;// the basic operator
@@ -81,11 +84,13 @@ NAMESPACE_BEGIN(Grid);
     remez.generateApprox(param.degree,1,2);
     PowerHalf.Init(remez,param.tolerance,false);
     PowerNegHalf.Init(remez,param.tolerance,true);
+    MDPowerNegHalf.Init(remez,param.mdtolerance,true);

     // MdagM^(+- 1/4)
     std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
     remez.generateApprox(param.degree,1,4);
     PowerQuarter.Init(remez,param.tolerance,false);
+    MDPowerQuarter.Init(remez,param.mdtolerance,false);
     PowerNegQuarter.Init(remez,param.tolerance,true);
   };

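The two hunks above give the even-odd ratio rational action a second family of rational approximations: the existing Power* sets, built at param.tolerance, keep serving the heatbath and acceptance steps, while the new MDPower* sets reuse the same Remez fit at the looser param.mdtolerance for the molecular-dynamics force. A minimal sketch of the pattern, assuming Grid's AlgRemez and MultiShiftFunction exactly as they appear in the hunk (an illustrative fragment, not the full constructor):

    // One Remez fit per exponent; two coefficient sets per fit, differing
    // only in the tolerance at which the multishift solver will later run.
    AlgRemez remez(param.lo, param.hi, param.precision);

    remez.generateApprox(param.degree, 1, 2);             // x^(+-1/2) on [lo,hi]
    PowerHalf.Init(remez, param.tolerance, false);        // acceptance quality
    PowerNegHalf.Init(remez, param.tolerance, true);
    MDPowerNegHalf.Init(remez, param.mdtolerance, true);  // force quality (looser)

    remez.generateApprox(param.degree, 1, 4);             // x^(+-1/4)
    PowerQuarter.Init(remez, param.tolerance, false);
    MDPowerQuarter.Init(remez, param.mdtolerance, false);
    PowerNegQuarter.Init(remez, param.tolerance, true);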
@@ -226,8 +231,8 @@ NAMESPACE_BEGIN(Grid);

     virtual void deriv(const GaugeField &U,GaugeField & dSdU) {

-      const int n_f  = PowerNegHalf.poles.size();
-      const int n_pv = PowerQuarter.poles.size();
+      const int n_f  = MDPowerNegHalf.poles.size();
+      const int n_pv = MDPowerQuarter.poles.size();

       std::vector<FermionField> MpvPhi_k     (n_pv,NumOp.FermionRedBlackGrid());
       std::vector<FermionField> MpvMfMpvPhi_k(n_pv,NumOp.FermionRedBlackGrid());
@@ -246,8 +251,8 @@ NAMESPACE_BEGIN(Grid);
     SchurDifferentiableOperator<Impl> VdagV(NumOp);
     SchurDifferentiableOperator<Impl> MdagM(DenOp);

-    ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,PowerQuarter);
-    ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,PowerNegHalf);
+    ConjugateGradientMultiShift<FermionField> msCG_V(param.MaxIter,MDPowerQuarter);
+    ConjugateGradientMultiShift<FermionField> msCG_M(param.MaxIter,MDPowerNegHalf);

     msCG_V(VdagV,PhiOdd,MpvPhi_k,MpvPhi);
     msCG_M(MdagM,MpvPhi,MfMpvPhi_k,MfMpvPhi);
@@ -266,7 +271,7 @@ NAMESPACE_BEGIN(Grid);

     //(1)
     for(int k=0;k<n_f;k++){
-      ak = PowerNegHalf.residues[k];
+      ak = MDPowerNegHalf.residues[k];
       MdagM.Mpc(MfMpvPhi_k[k],Y);
       MdagM.MpcDagDeriv(tmp , MfMpvPhi_k[k], Y ); dSdU=dSdU+ak*tmp;
       MdagM.MpcDeriv(tmp , Y, MfMpvPhi_k[k] ); dSdU=dSdU+ak*tmp;
@@ -276,7 +281,7 @@ NAMESPACE_BEGIN(Grid);
     //(3)
     for(int k=0;k<n_pv;k++){

-      ak = PowerQuarter.residues[k];
+      ak = MDPowerQuarter.residues[k];

       VdagV.Mpc(MpvPhi_k[k],Y);
       VdagV.MpcDagDeriv(tmp,MpvMfMpvPhi_k[k],Y); dSdU=dSdU+ak*tmp;
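A rational force term has to draw its shift poles (fed to the multishift CG), its pole count, and its residue weights from the same MultiShiftFunction, so the deriv() hunks above switch every reference from the acceptance-quality Power* sets to the MDPower* sets together. A condensed sketch of the resulting loop, with hypothetical names psi/psi_k standing in for the MpvPhi/MfMpvPhi intermediates (the fragment assumes the locals Y, tmp, dSdU, phi declared in deriv() as above):

    // Solve (MdagM + p_k) psi_k = phi for every pole p_k of the MD
    // approximation, then accumulate residue-weighted derivative terms.
    ConjugateGradientMultiShift<FermionField> msCG(param.MaxIter, MDPowerNegHalf);
    msCG(MdagM, phi, psi_k, psi);                  // one shifted solution per pole

    for (int k = 0; k < (int)MDPowerNegHalf.poles.size(); k++) {
      RealD ak = MDPowerNegHalf.residues[k];       // weight for pole k
      MdagM.Mpc(psi_k[k], Y);
      MdagM.MpcDagDeriv(tmp, psi_k[k], Y); dSdU = dSdU + ak * tmp;
      MdagM.MpcDeriv   (tmp, Y, psi_k[k]); dSdU = dSdU + ak * tmp;
    }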
@@ -75,16 +75,14 @@ NAMESPACE_BEGIN(Grid);
   remez.generateApprox(param.degree,1,2);
   PowerHalf.Init(remez,param.tolerance,false);
   PowerNegHalf.Init(remez,param.tolerance,true);
+  MDPowerNegHalf.Init(remez,param.mdtolerance,true);

   // MdagM^(+- 1/4)
   std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/4)"<<std::endl;
   remez.generateApprox(param.degree,1,4);
   PowerQuarter.Init(remez,param.tolerance,false);
-  PowerNegQuarter.Init(remez,param.tolerance,true);
-
-  // Derive solves different tol
   MDPowerQuarter.Init(remez,param.mdtolerance,false);
-  MDPowerNegHalf.Init(remez,param.mdtolerance,true);
+  PowerNegQuarter.Init(remez,param.tolerance,true);
   };

   virtual std::string action_name(){return "OneFlavourRatioRationalPseudoFermionAction";}
@@ -6,6 +6,13 @@ uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void) {return accelerator_threads;};
 void acceleratorThreads(uint32_t t) {accelerator_threads = t;};

+#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
+#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
+#define ENV_RANK_SLURM "SLURM_PROCID"
+#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
+#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
+
 #ifdef GRID_CUDA
 cudaDeviceProp *gpu_props;
 cudaStream_t copyStream;
@@ -17,12 +24,6 @@ void acceleratorInit(void)

   char * localRankStr = NULL;
   int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
-#define ENV_RANK_SLURM "SLURM_PROCID"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}
@@ -119,10 +120,6 @@ void acceleratorInit(void)

   char * localRankStr = NULL;
   int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
   // We extract the local rank initialization using an environment variable
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
   {
@@ -134,8 +131,10 @@ void acceleratorInit(void)
   }
   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);}
   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
+  if ((localRankStr = getenv(ENV_RANK_SLURM  )) != NULL) { world_rank = atoi(localRankStr);}

-  printf("world_rank %d has %d devices\n",world_rank,nDevices);
+  if ( world_rank == 0 )
+    printf("world_rank %d has %d devices\n",world_rank,nDevices);
   size_t totalDeviceMem=0;
   for (int i = 0; i < nDevices; i++) {

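Taken together, the acceleratorInit() hunks hoist the launcher environment-variable names to file scope so the CUDA, HIP, and SYCL code paths share a single set of #defines, add SLURM_PROCID to world-rank detection on the HIP path, and print the per-rank device banner only from world rank 0. A self-contained sketch of the detection idiom (plain C++, no Grid headers; the main() wrapper is added here only to make it runnable):

    // Probe the variables set by each MPI launcher (OpenMPI, MVAPICH2,
    // SLURM) in turn; the last launcher found wins. Only rank 0 prints.
    #include <cstdio>
    #include <cstdlib>

    #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
    #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK"
    #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID"
    #define ENV_RANK_SLURM         "SLURM_PROCID"
    #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
    #define ENV_RANK_MVAPICH       "MV2_COMM_WORLD_RANK"

    int main(void) {
      int rank = 0, world_rank = 0;
      char *s;
      if ((s = std::getenv(ENV_LOCAL_RANK_OMPI   )) != NULL) rank = std::atoi(s);
      if ((s = std::getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) rank = std::atoi(s);
      if ((s = std::getenv(ENV_LOCAL_RANK_SLURM  )) != NULL) rank = std::atoi(s);
      if ((s = std::getenv(ENV_RANK_OMPI   )) != NULL) world_rank = std::atoi(s);
      if ((s = std::getenv(ENV_RANK_MVAPICH)) != NULL) world_rank = std::atoi(s);
      if ((s = std::getenv(ENV_RANK_SLURM  )) != NULL) world_rank = std::atoi(s);
      if (world_rank == 0)
        std::printf("local rank %d, world rank %d\n", rank, world_rank);
      return 0;
    }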
@@ -208,10 +207,7 @@ void acceleratorInit(void)

   char * localRankStr = NULL;
   int rank = 0, world_rank=0;
-#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
-#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
-#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
-#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
   // We extract the local rank initialization using an environment variable
   if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
   {
@@ -102,7 +102,7 @@ int main(int argc, char **argv) {
   SFRp.hi       = 30.0;
   SFRp.MaxIter  = 10000;
   SFRp.tolerance= 1.0e-8;
-  SFRp.mdtolerance= 1.0e-6;
+  SFRp.mdtolerance= 1.0e-5;
   SFRp.degree   = 16;
   SFRp.precision= 50;
   SFRp.BoundsCheckFreq=5;
@@ -112,7 +112,7 @@ int main(int argc, char **argv) {
   OFRp.hi       = 30.0;
   OFRp.MaxIter  = 10000;
   OFRp.tolerance= 1.0e-8;
-  OFRp.mdtolerance= 1.0e-6;
+  OFRp.mdtolerance= 1.0e-5;
   OFRp.degree   = 16;
   OFRp.precision= 50;
   OFRp.BoundsCheckFreq=5;
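In both parameter blocks the acceptance tolerance stays at 1.0e-8 while mdtolerance moves to 1.0e-5, so the force-term solves run three orders of magnitude looser than the acceptance solves; these are exactly the tolerance/mdtolerance pairs consumed by the Power*/MDPower* initialisations in the action headers above.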
@@ -162,15 +162,17 @@ int main(int argc, char **argv) {
   FermionAction::ImplParams Params(boundary);

   double StoppingCondition = 1e-8;
+  double MDStoppingCondition = 1e-6;
   double MaxCGIterations = 30000;
   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
+  ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations);

   ////////////////////////////////////
   // Collect actions
   ////////////////////////////////////
   ActionLevel<HMCWrapper::Field> Level1(1);
   ActionLevel<HMCWrapper::Field> Level2(4);
-  ActionLevel<HMCWrapper::Field> Level3(6);
+  ActionLevel<HMCWrapper::Field> Level3(8);

   ////////////////////////////////////
   // Strange action
@@ -226,7 +228,7 @@ int main(int argc, char **argv) {
     Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
     if(h!=0) {
-      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
+      Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
     } else {
       Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
       Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
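The two-flavour quotient actions now split their solvers the same way: a looser CG for the molecular-dynamics derivative and a tighter one for the action/heatbath evaluations. A minimal sketch, assuming (as the MDCG,CG ordering in the hunk suggests) that TwoFlavourEvenOddRatioPseudoFermionAction takes the derivative solver first and the action solver second:

    // From the hunks above: 1e-6 stopping for MD forces, 1e-8 for the action.
    double StoppingCondition   = 1e-8;
    double MDStoppingCondition = 1e-6;
    double MaxCGIterations     = 30000;
    ConjugateGradient<FermionField> CG  (StoppingCondition,  MaxCGIterations);
    ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);

    // Derivative (MD) solver first, action solver second.
    Quotients.push_back(new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(
        *Numerators[h], *Denominators[h], MDCG, CG));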
@@ -241,7 +243,7 @@ int main(int argc, char **argv) {
   for(int h=0;h<nquo-1;h++){
     Level2.push_back(Quotients[h]);
   }
-  Level1.push_back(Quotients[nquo-1]); // PV dirichlet fix on coarse timestep
+  Level2.push_back(Quotients[nquo-1]);

   /////////////////////////////////////////////////////////////
   // Gauge action
@@ -9,6 +9,7 @@
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-gparity \
 CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
-LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
-HIPFLAGS = --amdgpu-target=gfx90a
+CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include " \
+LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "

@@ -12,19 +12,21 @@
 #SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4

 DIR=.
-module list
+source sourceme.sh
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
-#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
-export MPICH_SMP_SINGLE_COPY_MODE=NONE
-#export MPICH_SMP_SINGLE_COPY_MODE=CMA
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 export OMP_NUM_THREADS=1

 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE

-PARAMS=" --accelerator-threads 16 --grid 32.32.32.256 --mpi 1.1.1.8 --comms-overlap --shm 2048 --shm-mpi 0"
-echo $PARAMS
+echo working directory
+pwd

+PARAMS=" --accelerator-threads 8 --grid 32.32.32.32 --mpi 1.1.1.1 --comms-sequential --shm 2048 --shm-mpi 0"
+srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
+
+PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.1 --comms-sequential --shm 2048 --shm-mpi 0"
 srun --gpus-per-task 1 -n8 ./benchmarks/Benchmark_dwf_fp32 $PARAMS

@@ -12,37 +12,30 @@
 #SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4

 DIR=.
-module list
+source setup.sh

 export MPICH_OFI_NIC_POLICY=GPU
 export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 export MPICH_GPU_SUPPORT_ENABLED=1
-#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA
-export MPICH_SMP_SINGLE_COPY_MODE=NONE
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE
 export OMP_NUM_THREADS=1

 echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE

-PARAMS=" --accelerator-threads 16 --grid 64.64.64.256 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
+for vol in 64.64.64.256 64.64.64.128 32.32.32.256 32.32.32.128
+do
+PARAMS=" --accelerator-threads 8 --grid $vol --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
 echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.256.8node
+srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.${vol}.8node.shm-mpi1
+done

-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 1"
+PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
 echo $PARAMS
-srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node
+srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node

-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 4.4.4.1 --comms-overlap --shm 2048 --shm-mpi 0"
+PARAMS=" --accelerator-threads 8 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
 echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_dwf_fp32 $PARAMS > dwf.64.64.64.32.8node.shm0
+srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0

-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 1"
-echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node
-
-PARAMS=" --accelerator-threads 16 --grid 64.64.64.32 --mpi 2.2.2.8 --comms-overlap --shm 2048 --shm-mpi 0"
-echo $PARAMS
-#srun --gpus-per-task 1 -N8 -n64 ./benchmarks/Benchmark_ITT $PARAMS > itt.8node_shm0

@@ -1,6 +1,9 @@
 module load PrgEnv-gnu
-module load rocm/4.5.0
+module load rocm/5.1.0
+module load cray-mpich/8.1.15
 module load gmp
-module load cray-fftw
+#module load cray-fftw
 module load craype-accel-amd-gfx90a
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+#Hack for lib
+export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH