
Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1

commit 3bc2da5321
Peter Boyle, 2023-10-05 16:57:59 -04:00
12 changed files with 145 additions and 23 deletions

View File

@@ -300,10 +300,10 @@ public:
     const int Nsimd = CComplex::Nsimd();
     int osites=pin.Grid()->oSites();
-    int gsites=pin.Grid()->gSites();
+    // int gsites=pin.Grid()->gSites();
-    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
-    RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
+    RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
     // for(int point=0;point<npoint;point++){
     //   conformable(A[point],pin);
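
For context, the corrected flop count above is per local site (osites), not global sites: each rank applies an nbasis x nbasis complex matrix at every one of its own sites for each stencil point, and one complex multiply-add costs 8 flops. Below is a standalone sketch of the accounting, mirroring the patched expressions; the osites, npoint, and nbasis values are assumed for illustration, not taken from the commit.

#include <cstdio>
#include <complex>

int main(void) {
  typedef std::complex<double> Complex;
  const double osites = 8.0*8*8*8;  // local coarse sites per rank (assumed)
  const int    npoint = 9;          // stencil points (assumed)
  const int    nbasis = 40;         // coarse basis size (assumed)
  const double siteMatrix = 1.0*nbasis*nbasis*sizeof(Complex);
  const double siteVector = 1.0*nbasis*sizeof(Complex);

  // 8 flops per complex multiply-add; one matrix-vector per site per point
  double flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
  double bytes = (1.0*osites*siteMatrix*npoint+2.0*osites*siteVector)*npoint;

  printf("flops %g bytes %g flops/byte %.3f\n", flops, bytes, flops/bytes);
  return 0;
}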
@@ -358,7 +358,7 @@ public:
   void PopulateAdag(void)
   {
-    for(int bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
+    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
       Coordinate bcoor;
       CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
@@ -543,10 +543,13 @@ public:
     }
     // Only needed if nonhermitian
-    if ( ! hermitian )
+    if ( ! hermitian ) {
+      std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
       PopulateAdag();
+    }
     // Need to write something to populate Adag from A
+    std::cout << GridLogMessage<<"ExchangeCoarseLinks "<<std::endl;
     ExchangeCoarseLinks();
     std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
     std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
@@ -556,6 +559,7 @@ public:
   }
   void ExchangeCoarseLinks(void){
     for(int p=0;p<geom.npoint;p++){
+      std::cout << "Exchange "<<p<<std::endl;
       _A[p] = Cell.Exchange(_A[p]);
       _Adag[p]= Cell.Exchange(_Adag[p]);
     }

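For readers unfamiliar with the pattern above, Cell.Exchange refills the halo (padded) region of each coarse-link field from neighbouring ranks before the stencil is applied. Below is a generic one-dimensional MPI halo exchange sketching the same idea; it is an illustration under an assumed layout (halo cells at both ends of a contiguous array), not Grid's PaddedCell implementation.

#include <mpi.h>
#include <vector>

// f = [left halo | n interior sites | right halo], each halo 'halo' wide.
void halo_exchange(std::vector<double>& f, int halo, MPI_Comm comm) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  int left  = (rank - 1 + size) % size;   // periodic neighbours
  int right = (rank + 1) % size;
  int n = (int)f.size() - 2*halo;
  // send rightmost interior sites right; receive our left halo from the left
  MPI_Sendrecv(&f[n],      halo, MPI_DOUBLE, right, 0,
               &f[0],      halo, MPI_DOUBLE, left,  0, comm, MPI_STATUS_IGNORE);
  // send leftmost interior sites left; receive our right halo from the right
  MPI_Sendrecv(&f[halo],   halo, MPI_DOUBLE, left,  1,
               &f[halo+n], halo, MPI_DOUBLE, right, 1, comm, MPI_STATUS_IGNORE);
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int halo = 1;
  std::vector<double> f(2*halo + 4, 0.0);
  halo_exchange(f, halo, MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}
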
View File

@@ -70,8 +70,8 @@ public:
   Coordinate _istride; // Inner stride i.e. within simd lane
   int _osites;         // _isites*_osites = product(dimensions).
   int _isites;
-  int _fsites;         // _isites*_osites = product(dimensions).
-  int _gsites;
+  int64_t _fsites;     // _isites*_osites = product(dimensions).
+  int64_t _gsites;
   Coordinate _slice_block;  // subslice information
   Coordinate _slice_stride;
   Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
   inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
   inline int oSites(void) const { return _osites; };
   inline int lSites(void) const { return _isites*_osites; };
-  inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
+  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
   inline int Nd (void) const { return _ndimension;};
   inline const Coordinate LocalStarts(void) { return _lstart; };
@@ -214,7 +214,7 @@ public:
   ////////////////////////////////////////////////////////////////
   // Global addressing
   ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
     assert(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
@@ -222,7 +222,7 @@ public:
     assert(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
     gidx=0;
     int mult=1;
     for(int mu=0;mu<_ndimension;mu++) {

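The int64_t widening in this header is the heart of the commit: the global site count _isites*_osites*_Nprocessors overflows a 32-bit int once the machine-wide volume passes 2^31 sites. A standalone illustration with assumed global extents:

#include <cstdint>
#include <cstdio>

int main(void) {
  const int L[4] = {256, 256, 256, 512};  // hypothetical global extents
  int64_t gsites64 = 1;
  for (int mu = 0; mu < 4; mu++) gsites64 *= (int64_t)L[mu];
  int gsites32 = (int)gsites64;  // what the old 32-bit field would hold
  // 256^3 * 512 = 2^33 sites: the 32-bit value wraps (here to 0)
  printf("64-bit: %lld  truncated 32-bit: %d\n", (long long)gsites64, gsites32);
  return 0;
}
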
View File

@@ -360,7 +360,7 @@ public:
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
   typedef typename vobj::scalar_object sobj;
-  for(int g=0;g<o.Grid()->_gsites;g++){
+  for(int64_t g=0;g<o.Grid()->_gsites;g++){
     Coordinate gcoor;
     o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);

View File

@@ -432,7 +432,7 @@ public:
 #if 1
   thread_for( lidx, _grid->lSites(), {
-    int gidx;
+    int64_t gidx;
     int o_idx;
     int i_idx;
     int rank;

View File

@@ -1054,7 +1054,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
   Coordinate fcoor(nd);
   Coordinate ccoor(nd);
-  for(int g=0;g<fg->gSites();g++){
+  for(int64_t g=0;g<fg->gSites();g++){
     fg->GlobalIndexToGlobalCoor(g,fcoor);
     for(int d=0;d<nd;d++){

View File

@@ -8,7 +8,7 @@ namespace Grid{
   public:
     template<class coor_t>
-    static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
+    static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
@@ -18,28 +18,45 @@ namespace Grid{
     }
     template<class coor_t>
-    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){
       int nd=dims.size();
       int stride=1;
       index=0;
       for(int d=0;d<nd;d++){
-        index = index+stride*coor[d];
+        index = index+(int64_t)stride*coor[d];
         stride=stride*dims[d];
       }
     }
+    template<class coor_t>
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+      int64_t index64;
+      IndexFromCoor(coor,index64,dims);
+      assert(index64<2*1024*1024*1024LL);
+      index = (int) index64;
+    }
     template<class coor_t>
-    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){
       int nd=dims.size();
       int stride=1;
       index=0;
       for(int d=nd-1;d>=0;d--){
-        index = index+stride*coor[d];
+        index = index+(int64_t)stride*coor[d];
         stride=stride*dims[d];
       }
     }
     template<class coor_t>
-    static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+      int64_t index64;
+      IndexFromCoorReversed(coor,index64,dims);
+      if ( index64>=2*1024*1024*1024LL ){
+        std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
+      }
+      assert(index64<2*1024*1024*1024LL);
+      index = (int) index64;
+    }
+    template<class coor_t>
+    static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
       int nd= dims.size();
       coor.resize(nd);
       for(int d=nd-1;d>=0;d--){

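The overload pattern above keeps the 64-bit IndexFromCoor as the workhorse and narrows to int behind an assert for callers that cannot overflow. A standalone round-trip sketch of the same lexicographic convention (coor[0] runs fastest); this is an illustrative reimplementation, not Grid's header:

#include <cassert>
#include <cstdint>
#include <vector>

static void coorFromIndex(std::vector<int>& coor, int64_t index,
                          const std::vector<int>& dims) {
  coor.resize(dims.size());
  for (size_t d = 0; d < dims.size(); d++) {
    coor[d] = (int)(index % dims[d]);  // peel off the fastest dimension first
    index   = index / dims[d];
  }
}

static int64_t indexFromCoor(const std::vector<int>& coor,
                             const std::vector<int>& dims) {
  int64_t index = 0, stride = 1;
  for (size_t d = 0; d < dims.size(); d++) {
    index  += stride * coor[d];  // 64-bit accumulation, as in the patch
    stride *= dims[d];
  }
  return index;
}

int main(void) {
  std::vector<int> dims = {256, 256, 256, 512};  // assumed global extents
  std::vector<int> coor;
  int64_t g = 5000000000LL;                // > 2^31: needs the int64_t path
  coorFromIndex(coor, g, dims);
  assert(indexFromCoor(coor, dims) == g);  // round trip is exact
  return 0;
}
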
View File

@@ -0,0 +1,43 @@
#!/bin/bash -l
#SBATCH --job-name=bench
##SBATCH --partition=small-g
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:10:00
#SBATCH --account=phy157_dwf
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
root=$HOME/Frontier/Grid/systems/Frontier/
source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done
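
The generated select_gpu wrapper pins each local rank to one GCD (in the Frontier-specific 0 1 2 3 7 6 5 4 order) and its matching NUMA domain before the binary starts. The device half of that binding could also be done inside the application; below is a minimal HIP sketch under the same assumed mapping, shown for illustration only (not part of the commit, and it does not replace the numactl memory binding).

#include <cstdlib>
#include <hip/hip_runtime.h>

int main(int argc, char** argv) {
  const int gpu_map[8] = {0, 1, 2, 3, 7, 6, 5, 4}; // GCD order from the script
  const char* lid = std::getenv("SLURM_LOCALID");
  int local_rank = lid ? std::atoi(lid) : 0;
  if (hipSetDevice(gpu_map[local_rank % 8]) != hipSuccess) return 1;
  // ... initialise MPI and run the benchmark here ...
  return 0;
}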

View File

@@ -0,0 +1,23 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "

systems/Frontier/mpiwrapper.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/bash
lrank=$SLURM_LOCALID
lgpu=(0 1 2 3 7 6 5 4)
export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES "
$*

View File

@@ -0,0 +1,13 @@
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH

systems/Frontier/wrap.sh Executable file
View File

@@ -0,0 +1,9 @@
#!/bin/sh
export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES
unset ROCR_VISIBLE_DEVICES
#rank=$SLURM_PROCID
#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@
$@

View File

@@ -305,10 +305,6 @@ int main (int argc, char ** argv)
   // std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
   std::vector<int> ords({7});         // Nbasis 40 == 40 iters (320 mults)
-  // Standard CG
-  // result=Zero();
-  // CGfine(HermOpEO, src, result);
   for(int l=0;l<los.size();l++){
     RealD lo = los[l];
@@ -348,6 +344,10 @@ int main (int argc, char ** argv)
     }
   }
+  // Standard CG
+  result=Zero();
+  CGfine(HermOpEO, src, result);
   Grid_finalize();
   return 0;