Mirror of https://github.com/paboyle/Grid.git

commit 40e119c61c (parent 7b0237b081)

    NUMA improvements worth preserving from AMD EPYC tests
@@ -60,16 +60,16 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
     uint64_t Nloop=NLOOP;
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;
 
 
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
     double time = (stop-start)/Nloop*1000;
 
     double flops=vol*Nvec*2;// mul,add
-    double bytes=3*vol*Nvec*sizeof(Real);
+    double bytes=3.0*vol*Nvec*sizeof(Real);
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 
   }
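
Note on the hunk above: the product of four lattice extents overflows a signed 32-bit int well within this benchmark's range once the extents are MPI-scaled (a 216^4 global volume is already ~2.18e9 sites, past INT_MAX), so vol becomes int64_t; likewise the literal 3.0 promotes the byte count to double before the multiply rather than after it has wrapped. The factor 3 counts the streams an AXPY z = a*x + y touches: load x, load y, store z. A minimal sketch of the failure mode, plain standard C++ with an illustrative extent:

    #include <cstdint>
    #include <iostream>

    int main () {
      int64_t lat   = 216;               // hypothetical extent: 216^4 sites
      int64_t vol64 = lat*lat*lat*lat;   // 2176782336, correct
      int32_t vol32 = (int32_t)vol64;    // narrows past INT_MAX: -2118184960
      std::cout << "64-bit vol: " << vol64
                << "  32-bit vol: " << vol32 << std::endl;
      return 0;
    }

Since usecond() counts microseconds, time = (stop-start)/Nloop*1000 is nanoseconds per iteration, so the printed bytes/time column reads directly in GB/s.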
@@ -97,14 +97,14 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;
 
     uint64_t Nloop=NLOOP;
 
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
     double time = (stop-start)/Nloop*1000;
 
     double flops=vol*Nvec*2;// mul,add
-    double bytes=3*vol*Nvec*sizeof(Real);
+    double bytes=3.0*vol*Nvec*sizeof(Real);
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 
   }
@@ -133,16 +133,16 @@ int main (int argc, char ** argv)
 
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
 
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;
 
 
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
     double stop=usecond();
     double time = (stop-start)/Nloop*1000;
 
-    double bytes=2*vol*Nvec*sizeof(Real);
+    double bytes=2.0*vol*Nvec*sizeof(Real);
     double flops=vol*Nvec*1;// mul
     std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 
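
Note: the same int64_t / double fix recurs kernel by kernel; only the traffic factor in bytes= changes, and it encodes the streams each kernel moves: AXPY z=a*x+y streams three vectors (3.0), SCALE z=a*x streams two (2.0), and the norm further down reads one (1.0). A hedged units check with made-up numbers, assuming (as above) that usecond() returns microseconds:

    #include <cstdio>
    #include <cstdint>

    int main () {
      double   usec  = 2.0e6;                     // hypothetical: 2 s total
      uint64_t Nloop = 1000;
      int64_t  vol   = 32LL*32*32*32;             // hypothetical 32^4 volume
      int      Nvec  = 4;                         // hypothetical vector block
      double time  = usec/Nloop*1000.0;           // ns per iteration
      double bytes = 2.0*vol*Nvec*sizeof(double); // SCALE: read x, write z
      printf("%.1f GB/s\n", bytes/time);          // bytes per ns == GB/s
      return 0;
    }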
@@ -169,14 +169,14 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;
     Real nn;
     double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
     double stop=usecond();
     double time = (stop-start)/Nloop*1000;
 
-    double bytes=vol*Nvec*sizeof(Real);
+    double bytes=1.0*vol*Nvec*sizeof(Real);
     double flops=vol*Nvec*2;// mul,add
     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
 
@@ -195,7 +195,8 @@ public:
 #endif
       size_type bytes = __n*sizeof(_Tp);
       uint8_t *cp = (uint8_t *)ptr;
-#pragma omp parallel for
+      // One touch per 4k page, static OMP loop to catch same loop order
+#pragma omp parallel for schedule(static)
       for(size_type n=0;n<bytes;n+=4096){
         cp[n]=0;
       }
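
Note: this hunk (from the aligned allocator, judging by the _Tp/size_type template context) is the first-touch half of the NUMA work. Linux backs a page on the NUMA node of the thread that first writes it, so zeroing one byte per 4 KiB page from a schedule(static) loop homes each page with the thread that will revisit the same index range in later static loops; the explicit schedule matters because the default schedule is implementation-defined and could disagree with the compute loops. A standalone sketch of the idiom — the helper name and malloc-based allocation are illustrative, not Grid's:

    #include <cstdint>
    #include <cstdlib>

    // First-touch placement: one write per 4 KiB page, distributed by the
    // same schedule(static) policy the compute loops use. Compile -fopenmp.
    void *first_touch_alloc(size_t bytes) {
      uint8_t *cp = (uint8_t *)std::malloc(bytes);
      if (cp == NULL) return NULL;
    #pragma omp parallel for schedule(static)
      for (int64_t n = 0; n < (int64_t)bytes; n += 4096) {
        cp[n] = 0; // the first write decides which node backs this page
      }
      return cp;
    }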
@@ -37,7 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <sys/ipc.h>
 #include <sys/shm.h>
 #include <sys/mman.h>
-//#include <zlib.h>
+#include <zlib.h>
+#include <numaif.h>
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -214,6 +215,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
       assert(((uint64_t)ptr&0x3F)==0);
 
+      int status;
+      int flags=MPOL_MF_MOVE;
+#ifdef KNL
+      int nodes=1; // numa domain == MCDRAM
+      // Find out if in SNC2,SNC4 mode ?
+#else
+      int nodes=r; // numa domain == MPI ID
+#endif
+      unsigned long count=1;
+      for(uint64_t page=0;page<size;page+=4096){
+        void *pages = (void *) ( page + (uint64_t)ptr );
+        uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
+        ierr= move_pages(0,count, &pages,&nodes,&status,flags);
+        if (ierr && (page==0)) perror("numa relocate command failed");
+      }
+
       ShmCommBufs[r] =ptr;
 
     }
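
Note: the added block is the explicit-migration half. Each 4 KiB page is written once (*cow_it = 1 forces the kernel to actually materialise it) and then handed to move_pages(2) to be moved to the chosen domain: the rank's own NUMA node on EPYC, domain 1 (MCDRAM) on KNL. A self-contained sketch of the same call with a hypothetical wrapper name; needs <numaif.h> and linking with -lnuma:

    #include <cstdio>
    #include <cstdint>
    #include <numaif.h> // move_pages(2), MPOL_MF_MOVE; link with -lnuma

    // Migrate every 4 KiB page of [ptr, ptr+size) to `node`, one page per
    // call, mirroring the loop added above.
    int bind_pages_to_node(void *ptr, uint64_t size, int node) {
      int flags = MPOL_MF_MOVE; // move only pages owned by this process
      unsigned long count = 1;
      for (uint64_t page = 0; page < size; page += 4096) {
        void *pages = (void *)(page + (uint64_t)ptr);
        *(volatile uint64_t *)pages = 1; // touch so the page exists
        int status;                      // out: final node, or -errno
        long ierr = move_pages(0, count, &pages, &node, &status, flags);
        if (ierr) {
          if (page == 0) perror("numa relocate command failed");
          return (int)ierr;
        }
      }
      return 0;
    }

Batching all pages into one move_pages call (count = size/4096, with arrays of page pointers and target nodes) would amortise the syscall cost; the per-page loop in the commit keeps the change minimal.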
|
Loading…
x
Reference in New Issue
Block a user