Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-22 09:42:02 +01:00)

Compare commits: fix/HOST_N ... 1b07a194b3 (11 commits)
Commits (newest first, SHA1):
1b07a194b3
461cd045c6
fee65d7a75
31f9971dbf
89c0519f83
2704b82084
cf8632bbac
d224297972
a4d11a630f
d87296f3e8
be94cf1c6f
@@ -281,12 +281,14 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   return nrm;
 }
 
 
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
   ComplexD nrm = rankInnerProduct(left,right);
-  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
+  //  GridNormLog(real(nrm)); // Could log before and after global sum to distinguish local and MPI
   grid->GlobalSum(nrm);
+  //  GridNormLog(real(nrm));
   return nrm;
 }
 
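For reference, this is what innerProduct looks like with both logging hooks switched on (a sketch assembled from the lines in the hunk above; in the committed code the two calls are left commented out):

template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase *grid = left.Grid();
  ComplexD nrm = rankInnerProduct(left,right);
  GridNormLog(real(nrm));   // log the node-local partial sum
  grid->GlobalSum(nrm);
  GridNormLog(real(nrm));   // log again after the MPI reduction, separating local from MPI non-reproducibility
  return nrm;
}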
@@ -411,7 +411,7 @@ public:
     std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
     SeedFixedIntegers(seeds);
   }
-  void SeedFixedIntegers(const std::vector<int> &seeds){
+  void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
 
     // Everyone generates the same seed_seq based on input seeds
     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
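Call-site sketch (the class name GridParallelRNG is an assumption not shown in this hunk; the second argument is the new flag introduced above, which defaults to the old behaviour):

GridParallelRNG pRNG(&grid);
std::vector<int> seeds({1, 2, 3, 4});
pRNG.SeedFixedIntegers(seeds);      // default britney=0: skip by global site index, as before
pRNG.SeedFixedIntegers(seeds, 1);   // britney=1: skip by local site index instead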
@@ -428,7 +428,6 @@ public:
     // MT implementation does not implement fast discard even though
     // in principle this is possible
     ////////////////////////////////////////////////
-#if 1
     thread_for( lidx, _grid->lSites(), {
 
       int gidx;
@@ -449,29 +448,12 @@ public:
 
       int l_idx=generator_idx(o_idx,i_idx);
       _generators[l_idx] = master_engine;
-      Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
-    });
-#else
-    // Everybody loops over global volume.
-    thread_for( gidx, _grid->_gsites, {
-
-      // Where is it?
-      int rank;
-      int o_idx;
-      int i_idx;
-
-      Coordinate gcoor;
-      _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-
-      // If this is one of mine we take it
-      if( rank == _grid->ThisRank() ){
-        int l_idx=generator_idx(o_idx,i_idx);
-        _generators[l_idx] = master_engine;
+      if ( britney ) {
+        Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
+      } else {
         Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
       }
     });
-#endif
 #else
     ////////////////////////////////////////////////////////////////
     // Machine and thread decomposition dependent seeding is efficient
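The behavioural difference is only in the offset handed to Skip: with britney=0 each generator is fast-forwarded by its global site index (so a site's stream does not depend on the MPI decomposition), with britney=1 by its local index. A standalone sketch of the same idea, using std::mt19937_64::discard as a stand-in for Grid's Skip (all names and sizes here are illustrative):

#include <cstdint>
#include <random>
#include <vector>

int main() {
  const uint64_t local_sites = 8;               // sites owned by this rank
  const uint64_t rank_offset = 3 * local_sites; // pretend we are rank 3
  const uint64_t stride      = 1ull << 20;      // draws reserved per site
  const bool britney = false;                   // false: global-index skip, true: local-index skip

  std::mt19937_64 master(0x5eed);               // every rank builds the same master engine
  std::vector<std::mt19937_64> gen(local_sites, master);

  for (uint64_t l = 0; l < local_sites; ++l) {
    uint64_t idx = britney ? l : (rank_offset + l);
    gen[l].discard(idx * stride);               // analogue of Skip(_generators[l_idx], idx)
  }
  return 0;
}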
@@ -1,5 +1,5 @@
 #pragma once
-#include <type_traits>
 #if defined(GRID_CUDA)
 
 #include <cub/cub.cuh>
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
 
 
 }
+#endif
 
-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+
+#if defined(GRID_SYCL)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+{
+  size_t subvol_size = e1*e2;
+
+  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
+  vobj vobj_zero;
+  zeroit(vobj_zero);
+  for (int r = 0; r<rd; r++) {
+    mysum[r] = vobj_zero;
+  }
+
+  commVector<vobj> reduction_buffer(rd*subvol_size);
+
+  auto rb_p = &reduction_buffer[0];
+
+  // autoView(Data_v, Data, AcceleratorRead);
+
+  //prepare reduction buffer
+  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
+
+      int n = s / e2;
+      int b = s % e2;
+      int so=r*ostride; // base offset for start of plane
+      int ss= so+n*stride+b;
+
+      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
+
+  });
+
+  for (int r = 0; r < rd; r++) {
+      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+                           Reduction,
+                           [=](cl::sycl::id<1> item, auto &sum) {
+                               auto s = item[0];
+                               sum += rb_p[r*subvol_size+s];
+                           });
+      });
+
+
+  }
+  theGridAccelerator->wait();
+  for (int r = 0; r < rd; r++) {
+    lvSum[r] = mysum[r];
+  }
+  free(mysum,*theGridAccelerator);
+}
+#endif
+
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
   typedef typename vobj::vector_type vector;
   const int words = sizeof(vobj)/sizeof(vector);
   const int osites = rd*e1*e2;
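The per-plane reduction above uses one sycl::reduction object per output slot. A self-contained sketch of the same pattern outside Grid, in standard SYCL 2020 spelling (sycl:: rather than the cl::sycl:: alias used in the diff; the queue, sizes and data here are hypothetical and it needs a SYCL compiler such as icpx -fsycl):

#include <sycl/sycl.hpp>
#include <functional>
#include <vector>
#include <cstdio>

int main() {
  sycl::queue q;                                  // stand-in for *theGridAccelerator
  const size_t planes = 4, subvol_size = 1024;    // "rd" planes, e1*e2 sites per plane

  std::vector<float> host(planes * subvol_size, 1.0f);
  float *buf = sycl::malloc_device<float>(host.size(), q);
  float *sum = sycl::malloc_shared<float>(planes, q);   // like mysum[] above
  q.memcpy(buf, host.data(), host.size() * sizeof(float)).wait();
  for (size_t r = 0; r < planes; r++) sum[r] = 0.0f;

  for (size_t r = 0; r < planes; r++) {
    q.submit([&](sycl::handler &cgh) {
      auto red = sycl::reduction(&sum[r], std::plus<>());   // one reduction per plane
      cgh.parallel_for(sycl::range<1>{subvol_size}, red,
                       [=](sycl::id<1> item, auto &acc) {
                         acc += buf[r * subvol_size + item[0]];
                       });
    });
  }
  q.wait();
  std::printf("plane 0 sum = %f (expect %zu)\n", sum[0], subvol_size);
  sycl::free(buf, q);
  sycl::free(sum, q);
  return 0;
}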
@@ -106,7 +159,11 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
       buf[ss] = dat[ss*words+w];
     });
 
+#if defined(GRID_CUDA) || defined(GRID_HIP)
     sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+#elif defined(GRID_SYCL)
+    sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+#endif
 
     for (int r = 0; r < rd; r++) {
       lvSum_ptr[w+words*r]=lvSum_small[r];
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 
 }
 
-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
+  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
   if constexpr (sizeof(vobj) <= 256) {
 
+#if defined(GRID_CUDA) || defined(GRID_HIP)
     sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#elif defined (GRID_SYCL)
+    sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+#endif
+
   }
   else {
-    sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+    sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
   }
 }
-#endif
-
-
-#if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  typedef typename vobj::scalar_object sobj;
-  size_t subvol_size = e1*e2;
-
-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
-  vobj vobj_zero;
-  zeroit(vobj_zero);
-
-  commVector<vobj> reduction_buffer(rd*subvol_size);
-
-  auto rb_p = &reduction_buffer[0];
-
-  autoView(Data_v, Data, AcceleratorRead);
-
-  //prepare reduction buffer
-  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
-
-      int n = s / e2;
-      int b = s % e2;
-      int so=r*ostride; // base offset for start of plane
-      int ss= so+n*stride+b;
-
-      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
-
-  });
-
-  for (int r = 0; r < rd; r++) {
-      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
-                           Reduction,
-                           [=](cl::sycl::id<1> item, auto &sum) {
-                               auto s = item[0];
-                               sum += rb_p[r*subvol_size+s];
-                           });
-      });
-      theGridAccelerator->wait();
-      lvSum[r] = mysum[0];
-  }
-
-  free(mysum,*theGridAccelerator);
-}
-#endif
-
 
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
   // sum over reduced dimension planes, breaking out orthog dir
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
 
 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
-  #if defined(GRID_CUDA) || defined(GRID_HIP)
+  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
 
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 
-  #elif defined(GRID_SYCL)
-
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-
   #else
   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
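For orientation, sliceSumReduction is the backend of Grid's user-facing sliceSum. A hypothetical driver, assuming a Grid build (grid construction, RNG seeding and types follow Grid's usual idioms, none of which are part of this diff):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  GridCartesian grid(GridDefaultLatt(),
                     GridDefaultSimd(Nd, vComplexD::Nsimd()),
                     GridDefaultMpi());

  GridParallelRNG pRNG(&grid);
  pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  LatticeComplexD field(&grid);
  random(pRNG, field);

  std::vector<TComplexD> slice_sums;
  sliceSum(field, slice_sums, 3);   // 3 = time direction; dispatches to sliceSumReduction_gpu or _cpu internally

  std::cout << GridLogMessage << "t=0 slice sum = " << slice_sums[0] << std::endl;
  Grid_finalize();
  return 0;
}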
@@ -90,11 +90,83 @@ NAMESPACE_BEGIN(Grid);
 static Coordinate Grid_default_latt;
 static Coordinate Grid_default_mpi;
 
 
+///////////////////////////////////////////////////////
+// Grid Norm logging for repro testing
+///////////////////////////////////////////////////////
+int GridNormLoggingMode;
+int32_t GridNormLoggingCounter;
+std::vector<double> GridNormLogVector;
+
+void SetGridNormLoggingMode(GridNormLoggingMode_t mode)
+{
+  switch ( mode ) {
+  case GridNormLoggingModePrint:
+    SetGridNormLoggingModePrint();
+    break;
+  case GridNormLoggingModeRecord:
+    SetGridNormLoggingModeRecord();
+    break;
+  case GridNormLoggingModeVerify:
+    SetGridNormLoggingModeVerify();
+    break;
+  case GridNormLoggingModeNone:
+    GridNormLoggingMode = mode;
+    GridNormLoggingCounter=0;
+    GridNormLogVector.resize(0);
+    break;
+  default:
+    assert(0);
+  }
+}
+
+void SetGridNormLoggingModePrint(void)
+{
+  GridNormLoggingCounter = 0;
+  GridNormLogVector.resize(0);
+  GridNormLoggingMode = GridNormLoggingModePrint;
+}
+void SetGridNormLoggingModeRecord(void)
+{
+  GridNormLoggingCounter = 0;
+  GridNormLogVector.resize(0);
+  GridNormLoggingMode = GridNormLoggingModeRecord;
+}
+void SetGridNormLoggingModeVerify(void)
+{
+  GridNormLoggingCounter = 0;
+  GridNormLoggingMode = GridNormLoggingModeVerify;
+}
+void GridNormLog(double value)
+{
+  if(GridNormLoggingMode == GridNormLoggingModePrint) {
+    std::cerr<<"GridNormLog : "<< GridNormLoggingCounter <<" " << std::hexfloat << value <<std::endl;
+    GridNormLoggingCounter++;
+  }
+  if(GridNormLoggingMode == GridNormLoggingModeRecord) {
+    GridNormLogVector.push_back(value);
+    GridNormLoggingCounter++;
+  }
+  if(GridNormLoggingMode == GridNormLoggingModeVerify) {
+    assert(GridNormLoggingCounter < GridNormLogVector.size());
+    if ( value != GridNormLogVector[GridNormLoggingCounter] ) {
+      fprintf(stderr,"%s Oops, I did it again! Reproduce failure for norm %d/%zu %.16e %.16e\n",GridHostname(),GridNormLoggingCounter,GridNormLogVector.size(),
+              value, GridNormLogVector[GridNormLoggingCounter]); fflush(stderr);
+    }
+    GridNormLoggingCounter++;
+  }
+}
+
 int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;
 
+char hostname[HOST_NAME_MAX+1];
+
+char *GridHostname(void)
+{
+  return hostname;
+}
 const Coordinate &GridDefaultLatt(void) {return Grid_default_latt;};
 const Coordinate &GridDefaultMpi(void) {return Grid_default_mpi;};
 const Coordinate GridDefaultSimd(int dims,int nsimd)
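Typical use of the new logging hooks in a reproducibility loop (a sketch mirroring the Test_dwf_mixedcg_prec change later in this compare; niter and RunSolve are placeholders, not Grid API):

for (int iter = 0; iter < niter; iter++) {
  if ( iter == 0 ) SetGridNormLoggingMode(GridNormLoggingModeRecord);  // first pass records every norm
  else             SetGridNormLoggingMode(GridNormLoggingModeVerify);  // later passes compare bit-for-bit
  RunSolve();  // e.g. a CG solve; once the hooks in innerProduct are enabled, each norm passes through GridNormLog()
}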
@@ -397,7 +469,6 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
 
-  char hostname[HOST_NAME_MAX+1];
   gethostname(hostname, HOST_NAME_MAX+1);
   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
 
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 void Grid_init(int *argc,char ***argv);
 void Grid_finalize(void);
 
+char * GridHostname(void);
+
 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
@@ -68,5 +70,20 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);
 
+
+enum GridNormLoggingMode_t {
+  GridNormLoggingModeNone,
+  GridNormLoggingModePrint,
+  GridNormLoggingModeRecord,
+  GridNormLoggingModeVerify
+};
+extern int GridNormLoggingMode;
+extern int32_t GridNormLoggingCounter;
+extern std::vector<double> GridNormLogVector;
+void SetGridNormLoggingModePrint(void);
+void SetGridNormLoggingModeRecord(void);
+void SetGridNormLoggingModeVerify(void);
+void SetGridNormLoggingMode(GridNormLoggingMode_t mode);
+void GridNormLog(double value);
+
 NAMESPACE_END(Grid);
 
systems/Aurora/tests/repro128.pbs (new file, 41 lines)
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=128
+#PBS -l walltime=02:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+# 12 ppn, 128 nodes, 1536 ranks
+CMD="mpiexec -np 1536 -ppn 12 -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
+$CMD
@@ -4,7 +4,7 @@
 
 #PBS -q EarlyAppAccess
 #PBS -l select=16
-#PBS -l walltime=01:00:00
+#PBS -l walltime=02:00:00
 #PBS -A LatticeQCD_aesp_CNDA
 
 #export OMP_PROC_BIND=spread
@@ -36,5 +36,6 @@ export MPICH_OFI_NIC_POLICY=GPU
 CMD="mpiexec -np 192 -ppn 12 -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
-	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 "
+#--comms-overlap
 $CMD
@@ -36,5 +36,5 @@ export MPICH_OFI_NIC_POLICY=GPU
 CMD="mpiexec -np 192 -ppn 12 -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
-	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
 $CMD
@@ -108,6 +108,11 @@ int main (int argc, char ** argv)
   csumref=0;
   int iter=0;
   do {
+    if ( iter == 0 ) {
+      SetGridNormLoggingMode(GridNormLoggingModeRecord);
+    } else {
+      SetGridNormLoggingMode(GridNormLoggingModeVerify);
+    }
     std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
     result_o = Zero();
     t1=usecond();
@@ -139,6 +144,11 @@ int main (int argc, char ** argv)
   csumref=0;
   int i=0;
   do {
+    if ( iter == 0 ) {
+      SetGridNormLoggingMode(GridNormLoggingModeRecord);
+    } else {
+      SetGridNormLoggingMode(GridNormLoggingModeVerify);
+    }
     std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
     result_o_2 = Zero();
     t1=usecond();