mirror of
https://github.com/paboyle/Grid.git
synced 2025-11-05 14:29:31 +00:00
Compare commits
10 Commits
7aa06329d0
...
73af020f98
| Author | SHA1 | Date | |
|---|---|---|---|
| 73af020f98 | |||
| bffb83c46e | |||
| 7031f37350 | |||
| 829dd74cb2 | |||
| 66e671985d | |||
| 5afcbcf0f3 | |||
| 9730579312 | |||
| bfae14d035 | |||
| b78fc73d19 | |||
|
|
709f8ae76c |
@@ -270,24 +270,24 @@ void CartesianCommunicator::GlobalSum(double &d)
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
FlightRecorder::StepLog("AllReduce float");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
FlightRecorder::StepLog("AllReduce double");
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
FlightRecorder::StepLog("AllReduce uint32_t");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
FlightRecorder::StepLog("AllReduce uint64_t");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
@@ -301,26 +301,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||
FlightRecorder::StepLog("GlobalXOR");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalMax(float &f)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalMax");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalMax(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalMax");
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalSumVector(float *)");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalSumVector(double *)");
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
@@ -836,6 +841,7 @@ int CartesianCommunicator::RankWorld(void){
|
||||
return r;
|
||||
}
|
||||
void CartesianCommunicator::BarrierWorld(void){
|
||||
FlightRecorder::StepLog("BarrierWorld");
|
||||
int ierr = MPI_Barrier(communicator_world);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
@@ -544,19 +544,20 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||
// acceleratorPin(HostCommBuf,bytes);
|
||||
#endif
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
std::cout << Mheader " acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
|
||||
}
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
std::cout<< "Setting up IPC"<<std::endl;
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout<< Mheader "Setting up IPC"<<std::endl;
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Loop over ranks/gpu's on our node
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -587,8 +588,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
if ( err != ZE_RESULT_SUCCESS ) {
|
||||
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
}
|
||||
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
|
||||
handle.pid = getpid();
|
||||
@@ -647,12 +646,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef SHM_SOCKETS
|
||||
myfd=UnixSockets::RecvFileDescriptor();
|
||||
#else
|
||||
std::cout<<"mapping seeking remote pid/fd "
|
||||
<<handle.pid<<"/"
|
||||
<<handle.fd<<std::endl;
|
||||
// std::cout<<"mapping seeking remote pid/fd "
|
||||
// <<handle.pid<<"/"
|
||||
// <<handle.fd<<std::endl;
|
||||
|
||||
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
|
||||
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
|
||||
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
|
||||
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
|
||||
myfd = syscall(438,pidfd,handle.fd,0);
|
||||
int err_t = errno;
|
||||
@@ -662,7 +661,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
|
||||
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
|
||||
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
|
||||
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
|
||||
|
||||
@@ -671,9 +670,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
|
||||
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
|
||||
}
|
||||
assert(thisBuf!=nullptr);
|
||||
}
|
||||
@@ -754,6 +750,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
WorldShmCommBufs[r] =ptr;
|
||||
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
|
||||
}
|
||||
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
|
||||
_ShmAlloc=1;
|
||||
_ShmAllocBytes = bytes;
|
||||
};
|
||||
|
||||
@@ -143,9 +143,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
if (comm_proc==0) {
|
||||
FlightRecorder::StepLog("Cshift_Copy_plane");
|
||||
tcopy-=usecond();
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
tcopy+=usecond();
|
||||
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
@@ -153,9 +155,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
FlightRecorder::StepLog("Cshift_Gather_plane");
|
||||
tgather-=usecond();
|
||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
|
||||
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
@@ -166,6 +170,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
tcomms-=usecond();
|
||||
grid->Barrier();
|
||||
|
||||
FlightRecorder::StepLog("Cshift_SendRecv");
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
@@ -182,10 +187,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
bytes);
|
||||
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
|
||||
#endif
|
||||
FlightRecorder::StepLog("Cshift_SendRecv_complete");
|
||||
|
||||
xbytes+=bytes;
|
||||
grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
FlightRecorder::StepLog("Cshift_barrier_complete");
|
||||
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
||||
|
||||
@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::StepLog("Start global sum");
|
||||
// grid->GlobalSumP2P(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
grid->GlobalSumP2P(nrm);
|
||||
// grid->GlobalSum(nrm);
|
||||
FlightRecorder::StepLog("Finished global sum");
|
||||
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
|
||||
@@ -240,7 +240,7 @@ void acceleratorInit(void)
|
||||
|
||||
char hostname[HOST_NAME_MAX+1];
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
|
||||
if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);
|
||||
|
||||
auto devices = sycl::device::get_devices();
|
||||
for(int d = 0;d<devices.size();d++){
|
||||
|
||||
@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#include <Grid/util/CompilerCompatible.h>
|
||||
|
||||
#ifdef HAVE_UNWIND
|
||||
#include <libunwind.h>
|
||||
#endif
|
||||
|
||||
#include <fenv.h>
|
||||
#ifdef __APPLE__
|
||||
@@ -295,6 +299,20 @@ void GridBanner(void)
|
||||
std::cout << std::setprecision(9);
|
||||
}
|
||||
|
||||
//Some file local variables
|
||||
static int fileno_stdout;
|
||||
static int fileno_stderr;
|
||||
static int signal_delay;
|
||||
// One mapped region of the process address space: the address range
// [start,end), its byte size, the file offset of the mapping and the name
// of the backing object. All members are public and left uninitialised by
// default construction; the creator fills them in.
struct dlRegion {
  uint64_t    start;   // first address of the mapping
  uint64_t    end;     // one past the last address of the mapping
  uint64_t    size;    // set by the creator as end - start
  uint64_t    offset;  // offset of the mapping within the backing file
  std::string name;    // path of the mapped object
};
|
||||
std::vector<dlRegion> dlMap;
|
||||
|
||||
void Grid_init(int *argc,char ***argv)
|
||||
{
|
||||
|
||||
@@ -347,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
||||
Grid_debug_handler_init();
|
||||
}
|
||||
// Sleep n-seconds at end of handler
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
|
||||
GridCmdOptionInt(arg,signal_delay);
|
||||
}
|
||||
// periodic wakeup with stack trace printed
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
|
||||
Grid_debug_heartbeat();
|
||||
}
|
||||
// periodic wakeup with empty handler (interrupts some system calls)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
|
||||
Grid_heartbeat();
|
||||
}
|
||||
|
||||
#if defined(A64FX)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||
@@ -396,15 +427,25 @@ void Grid_init(int *argc,char ***argv)
|
||||
fp=freopen(ename.str().c_str(),"w",stderr);
|
||||
assert(fp!=(FILE *)NULL);
|
||||
}
|
||||
fileno_stdout = fileno(stdout);
|
||||
fileno_stderr = fileno(stderr) ;
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// OK to use GridLogMessage etc from here on
|
||||
////////////////////////////////////////////////////
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
|
||||
{
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
time_t mytime;
|
||||
struct tm *info;
|
||||
char buffer[80];
|
||||
time(&mytime);
|
||||
info = localtime(&mytime);
|
||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
|
||||
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// Reporting
|
||||
@@ -421,6 +462,47 @@ void Grid_init(int *argc,char ***argv)
|
||||
MemoryProfiler::stats = &dbgMemStats;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// LD.so space
|
||||
/////////////////////////////////////////////////////////
|
||||
#ifndef __APPLE__
|
||||
{
|
||||
// Provides mapping of .so files
|
||||
FILE *f = fopen("/proc/self/maps", "r");
|
||||
if (f) {
|
||||
char line[256];
|
||||
while (fgets(line, sizeof(line), f)) {
|
||||
if (strstr(line, "r-xp")) {
|
||||
dlRegion region;
|
||||
uint32_t major, minor, inode;
|
||||
uint64_t start,end,offset;
|
||||
char path[PATH_MAX];
|
||||
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
|
||||
&start,&end,&offset,
|
||||
&major,&minor,&inode,path);
|
||||
region.start=start;
|
||||
region.end =end;
|
||||
region.offset=offset;
|
||||
region.name = std::string(path);
|
||||
region.size = region.end-region.start;
|
||||
dlMap.push_back(region);
|
||||
// std::cout << GridLogMessage<< line;
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
for(int r=0;r<dlMap.size();r++){
|
||||
auto region = dlMap[r];
|
||||
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
|
||||
}
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////
|
||||
// Logging
|
||||
////////////////////////////////////
|
||||
@@ -453,14 +535,19 @@ void Grid_init(int *argc,char ***argv)
|
||||
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Debug:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
@@ -555,17 +642,56 @@ void GridLogLayout() {
|
||||
}
|
||||
|
||||
void * Grid_backtrace_buffer[_NBACKTRACE];
|
||||
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
|
||||
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
void sig_print_dig(uint32_t dig)
|
||||
{
|
||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||
fprintf(stderr,"FlightRecorder step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||
fprintf(stderr," code %d\n",si->si_code);
|
||||
// x86 64bit
|
||||
const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
|
||||
if ( dig>=0 && dig< 16){
|
||||
SIGLOG(digits[dig]);
|
||||
}
|
||||
}
|
||||
void sig_print_uint(uint32_t A)
|
||||
{
|
||||
int dig;
|
||||
int nz=0;
|
||||
#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
|
||||
DIGIT(1000000000); // Catches 4BN = 2^32
|
||||
DIGIT(100000000);
|
||||
DIGIT(10000000);
|
||||
DIGIT(1000000);
|
||||
DIGIT(100000);
|
||||
DIGIT(10000);
|
||||
DIGIT(1000);
|
||||
DIGIT(100);
|
||||
DIGIT(10);
|
||||
DIGIT(1);
|
||||
if (nz==0) SIGLOG("0");
|
||||
}
|
||||
void sig_print_hex(uint64_t A)
|
||||
{
|
||||
int nz=0;
|
||||
int dig;
|
||||
#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
|
||||
SIGLOG("0x");
|
||||
NIBBLE((A>>(15*4))&0xF);
|
||||
NIBBLE((A>>(14*4))&0xF);
|
||||
NIBBLE((A>>(13*4))&0xF);
|
||||
NIBBLE((A>>(12*4))&0xF);
|
||||
NIBBLE((A>>(11*4))&0xF);
|
||||
NIBBLE((A>>(10*4))&0xF);
|
||||
NIBBLE((A>>(9*4))&0xF);
|
||||
NIBBLE((A>>(8*4))&0xF);
|
||||
NIBBLE((A>>(7*4))&0xF);
|
||||
NIBBLE((A>>(6*4))&0xF);
|
||||
NIBBLE((A>>(5*4))&0xF);
|
||||
NIBBLE((A>>(4*4))&0xF);
|
||||
NIBBLE((A>>(3*4))&0xF);
|
||||
NIBBLE((A>>(2*4))&0xF);
|
||||
NIBBLE((A>>4)&0xF);
|
||||
sig_print_dig(A&0xF);
|
||||
}
|
||||
/*
|
||||
#ifdef __linux__
|
||||
#ifdef __x86_64__
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
@@ -573,80 +699,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||
#endif
|
||||
#endif
|
||||
fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
fprintf(stderr,"Called backtrace\n");
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
*/
|
||||
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
SIGLOG("Signal handler on host ");
|
||||
SIGLOG(hostname);
|
||||
SIGLOG(" process id ");
|
||||
sig_print_uint((uint32_t)getpid());
|
||||
SIGLOG("\n");
|
||||
SIGLOG("FlightRecorder step ");
|
||||
sig_print_uint(FlightRecorder::StepLoggingCounter);
|
||||
SIGLOG(" stage ");
|
||||
SIGLOG(FlightRecorder::StepName);
|
||||
SIGLOG("\n");
|
||||
SIGLOG("Caught signal ");
|
||||
sig_print_uint(si->si_signo);
|
||||
SIGLOG("\n");
|
||||
SIGLOG(" mem address ");
|
||||
sig_print_hex((uint64_t)si->si_addr);
|
||||
SIGLOG("\n");
|
||||
SIGLOG(" code ");
|
||||
sig_print_uint(si->si_code);
|
||||
SIGLOG("\n");
|
||||
|
||||
ucontext_t *uc= (ucontext_t *)ptr;
|
||||
|
||||
SIGLOG("Backtrace:\n");
|
||||
#ifdef HAVE_UNWIND
|
||||
// Debug cross check on offsets
|
||||
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
|
||||
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
|
||||
unw_cursor_t cursor;
|
||||
unw_word_t ip, off;
|
||||
if (!unw_init_local(&cursor, uc) ) {
|
||||
|
||||
SIGLOG(" frame IP function\n");
|
||||
int level = 0;
|
||||
int ret = 0;
|
||||
while(1) {
|
||||
char name[128];
|
||||
if (level >= _NBACKTRACE) return;
|
||||
|
||||
unw_get_reg(&cursor, UNW_REG_IP, &ip);
|
||||
|
||||
sig_print_uint(level); SIGLOG(" ");
|
||||
sig_print_hex(ip); SIGLOG(" ");
|
||||
for(int r=0;r<dlMap.size();r++){
|
||||
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
|
||||
SIGLOG(dlMap[r].name.c_str());
|
||||
SIGLOG("+");
|
||||
sig_print_hex((ip-dlMap[r].start));
|
||||
break;
|
||||
}
|
||||
}
|
||||
SIGLOG("\n");
|
||||
Grid_backtrace_buffer[level]=(void *)ip;
|
||||
level++;
|
||||
ret = unw_step(&cursor);
|
||||
if (ret <= 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Known Asynch-Signal unsafe
|
||||
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
|
||||
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
|
||||
#endif
|
||||
}
|
||||
|
||||
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
Grid_generic_handler(sig,si,ptr);
|
||||
SIGLOG("\n");
|
||||
}
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
Grid_generic_handler(sig,si,ptr);
|
||||
if (signal_delay) {
|
||||
SIGLOG("Adding extra signal delay ");
|
||||
sig_print_uint(signal_delay);
|
||||
SIGLOG(" s\n");
|
||||
usleep( (uint64_t) signal_delay*1000LL*1000LL);
|
||||
}
|
||||
SIGLOG("\n");
|
||||
return;
|
||||
}
|
||||
|
||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||
fprintf(stderr," code %d\n",si->si_code);
|
||||
// Linux/Posix
|
||||
#ifdef __linux__
|
||||
// And x86 64bit
|
||||
#ifdef __x86_64__
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
|
||||
REG(rdi);
|
||||
REG(rsi);
|
||||
REG(rbp);
|
||||
REG(rbx);
|
||||
REG(rdx);
|
||||
REG(rax);
|
||||
REG(rcx);
|
||||
REG(rsp);
|
||||
REG(rip);
|
||||
|
||||
|
||||
REG(r8);
|
||||
REG(r9);
|
||||
REG(r10);
|
||||
REG(r11);
|
||||
REG(r12);
|
||||
REG(r13);
|
||||
REG(r14);
|
||||
REG(r15);
|
||||
#endif
|
||||
#endif
|
||||
fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
fprintf(stderr,"Called backtrace\n");
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
Grid_generic_handler(sig,si,ptr);
|
||||
SIGLOG("\n");
|
||||
exit(0);
|
||||
return;
|
||||
};
|
||||
// Deliberately empty handler: installing it means delivery of the signal
// has no effect other than interrupting whatever the process was blocked in.
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
  (void)sig;
  (void)si;
  (void)ptr;
  //  SIGLOG("heartbeat signal handled\n");
}
|
||||
void Grid_debug_heartbeat(void)
|
||||
{
|
||||
struct sigaction sa_ping;
|
||||
|
||||
sigemptyset (&sa_ping.sa_mask);
|
||||
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
||||
sa_ping.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGALRM,&sa_ping,NULL);
|
||||
|
||||
// repeating 10s heartbeat
|
||||
struct itimerval it_val;
|
||||
it_val.it_value.tv_sec = 10;
|
||||
it_val.it_value.tv_usec = 0;
|
||||
it_val.it_interval = it_val.it_value;
|
||||
setitimer(ITIMER_REAL, &it_val, NULL);
|
||||
}
|
||||
void Grid_heartbeat(void)
|
||||
{
|
||||
struct sigaction sa_ping;
|
||||
|
||||
sigemptyset (&sa_ping.sa_mask);
|
||||
sa_ping.sa_sigaction= Grid_empty_signal_handler;
|
||||
sa_ping.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGALRM,&sa_ping,NULL);
|
||||
|
||||
// repeating 10s heartbeat
|
||||
struct itimerval it_val;
|
||||
it_val.it_value.tv_sec = 10;
|
||||
it_val.it_value.tv_usec = 1000;
|
||||
it_val.it_interval = it_val.it_value;
|
||||
setitimer(ITIMER_REAL, &it_val, NULL);
|
||||
}
|
||||
void Grid_exit_handler(void)
|
||||
{
|
||||
// BACKTRACEFP(stdout);
|
||||
// fflush(stdout);
|
||||
BACKTRACEFP(stdout);
|
||||
fflush(stdout);
|
||||
}
|
||||
void Grid_debug_handler_init(void)
|
||||
{
|
||||
struct sigaction sa;
|
||||
sigemptyset (&sa.sa_mask);
|
||||
sa.sa_sigaction= Grid_sa_signal_handler;
|
||||
sa.sa_sigaction= Grid_fatal_signal_handler;
|
||||
sa.sa_flags = SA_SIGINFO;
|
||||
// sigaction(SIGSEGV,&sa,NULL);
|
||||
sigaction(SIGTRAP,&sa,NULL);
|
||||
// sigaction(SIGBUS,&sa,NULL);
|
||||
// sigaction(SIGUSR2,&sa,NULL);
|
||||
|
||||
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
// sigaction(SIGFPE,&sa,NULL);
|
||||
sigaction(SIGKILL,&sa,NULL);
|
||||
sigaction(SIGILL,&sa,NULL);
|
||||
#ifndef GRID_SYCL
|
||||
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
|
||||
sigaction(SIGBUS,&sa,NULL);
|
||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
sigaction(SIGFPE,&sa,NULL);
|
||||
#endif
|
||||
|
||||
// Non terminating SIGUSR1/2 handler
|
||||
// Non terminating SIGHUP handler
|
||||
struct sigaction sa_ping;
|
||||
sigemptyset (&sa_ping.sa_mask);
|
||||
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
||||
|
||||
@@ -38,7 +38,11 @@ char * GridHostname(void);
|
||||
|
||||
// internal, controlled with --handle
|
||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
|
||||
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
|
||||
void Grid_debug_handler_init(void);
|
||||
void Grid_debug_heartbeat(void);
|
||||
void Grid_heartbeat(void);
|
||||
void Grid_quiesce_nodes(void);
|
||||
void Grid_unquiesce_nodes(void);
|
||||
|
||||
|
||||
18
configure.ac
18
configure.ac
@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
|
||||
[try this for a non-standard install prefix of the GMP library])],
|
||||
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
|
||||
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
|
||||
|
||||
AC_ARG_WITH([mpfr],
|
||||
[AS_HELP_STRING([--with-mpfr=prefix],
|
||||
[try this for a non-standard install prefix of the MPFR library])],
|
||||
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
|
||||
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
|
||||
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
|
||||
|
||||
############### LIBUNWIND
|
||||
AC_ARG_WITH([unwind],
|
||||
[AS_HELP_STRING([--with-unwind=prefix],
|
||||
[try this for a non-standard install prefix of the libunwind library])],
|
||||
[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
|
||||
[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
|
||||
|
||||
############### OpenSSL
|
||||
AC_ARG_WITH([openssl],
|
||||
[AS_HELP_STRING([--with-openssl=prefix],
|
||||
@@ -373,6 +381,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
|
||||
[have_lime=true],
|
||||
[AC_MSG_WARN(LIME library was not found in your system.)])
|
||||
|
||||
AC_SEARCH_LIBS([unw_backtrace], [unwind],
|
||||
[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
|
||||
[have_unwind=true],
|
||||
[AC_MSG_WARN(libunwind library was not found in your system.)])
|
||||
|
||||
# Architecture specific half of libunwind; warn with its own name so a missing
# libunwind-x86_64 can be told apart from a missing libunwind.
AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
               [AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
               [have_unwind_x86_64=true],
               [AC_MSG_WARN(libunwind-x86_64 library was not found in your system.)])
|
||||
|
||||
AC_SEARCH_LIBS([SHA256_Init], [crypto],
|
||||
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
|
||||
[have_crypto=true],
|
||||
|
||||
@@ -179,8 +179,8 @@ int main(int argc, char** argv) {
|
||||
Np=LanParams.Np;
|
||||
|
||||
int Nm = Nk + Np;
|
||||
int MaxIt = 10000;
|
||||
RealD resid = 1.0e-5;
|
||||
int MaxIt = 100;
|
||||
RealD resid = 1.0e-4;
|
||||
|
||||
|
||||
//while ( mass > - 5.0){
|
||||
|
||||
@@ -125,4 +125,29 @@ Extensions
|
||||
|
||||
8) Example python code: FieldDensity.py . This is not interfaced to Grid.
|
||||
|
||||
================
|
||||
Windowless generation of AVI files: must enable offscreen rendering. From Shuhei Yamamoto:
|
||||
================
|
||||
Hi Peter,
|
||||
|
||||
To make visualization work on Frontier, I did the following.
|
||||
|
||||
For headless off-screen rendering, ccmake tabs in advanced mode shown below are set as indicated.
|
||||
VTK_OPENGL_HAS_* off
|
||||
VTK_USE_X off
|
||||
VTK_DEFAULT_RENDER_WINDOW_OFFSCREEN on
|
||||
VTK_DEFAULT_RENDER_WINDOW_HEADLESS on
|
||||
The list can be greater than necessary.
|
||||
|
||||
VTK can fall back to EGL or OSMesa at runtime. So I installed mesa via spack (as well as nasm and yasm). Either mesa or meson package requires llvm-config, which is included after rocm6.1. On Frontier, I used /opt/rocm-6.2.4. The only problem is that llvm-config is located on /opt/rocm-6.2.4/llvm/bin, instead of /opt/rocm-6.2.4/bin. So I edited packages.yaml for spack so that the prefix for rocm compiler is /opt/rocm-6.2.4/llvm. Just in case, I also changed c and cxx to /opt/rocm-6.2.4/llvm/bin/amdclang, amdclang++, respectively, but this change might not be necessary.
|
||||
After installation, I added a path to libOSMesa.so to LD_LIBRARY_PATH, for which there might be a better way such as specifying -rpath for OSMesa lib by editing cmake files.
|
||||
|
||||
In addition, I have edited CMakeLists.txt for vtk to force vtk to find the OSMesa package via find_package(OSMesa REQUIRED) after list(INSERT CMAKE_MODULE_PATH 0 "${vtk_cmake_dir}"), as there is a Find package module in vtk/CMake. There may be a more elegant method, but I was not able to find a tab to switch on OSMesa.
|
||||
|
||||
When I compiled vtk and linked it to the Grid visualization code with the ffmpeg option, it produced an AVI file.
|
||||
|
||||
Best,
|
||||
Shuhei
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user