1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-11-05 14:29:31 +00:00

Compare commits

...

10 Commits

Author SHA1 Message Date
73af020f98 improved 2025-06-27 06:08:54 +00:00
bffb83c46e std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<"  --dylib-map     : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
    std::cout<<GridLogMessage<<"  --heartbeat     : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
    std::cout<<GridLogMessage<<"  --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-heartbeat : periodically report backtrace "<<std::endl;

--dylib-map : Grid prints its dylib regions
--heartbeat : itimer based / SIGALRM wake up which seems to make Aurora
more stable
--debug-heartbeat : periodically report to stderr where we are in code

A libunwind option is now available (configure: --with-unwind=<prefix>) to give an
async-signal-safe backtrace. This avoids the glibc backtrace, which calls malloc.
2025-06-27 06:08:54 +00:00
7031f37350 Use libunwind for backtrace as it is signal asynch safe 2025-06-27 06:08:54 +00:00
829dd74cb2 Verbose change 2025-06-27 06:08:54 +00:00
66e671985d P2P 2025-06-27 06:08:54 +00:00
5afcbcf0f3 Cshift uses flight recorder 2025-06-27 06:08:54 +00:00
9730579312 Simplify and verbose 2025-06-27 06:08:51 +00:00
bfae14d035 More flight logging 2025-06-27 06:07:34 +00:00
b78fc73d19 Better signal handler 2025-06-27 06:07:34 +00:00
Peter Boyle
709f8ae76c Update README 2025-06-26 23:06:11 -04:00
10 changed files with 356 additions and 95 deletions

View File

@@ -270,24 +270,24 @@ void CartesianCommunicator::GlobalSum(double &d)
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce");
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce");
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce");
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce");
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -301,26 +301,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
assert(ierr==0);
}
// In-place bitwise-XOR all-reduce of a single 64-bit word over `communicator`:
// on return u holds the MPI_BXOR combination of every rank's input value.
void CartesianCommunicator::GlobalXOR(uint64_t &u){
// Tag the flight recorder with the step name before entering the collective.
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
// A non-zero MPI return code is treated as fatal (checked only when asserts are enabled).
assert(ierr==0);
}
// In-place maximum all-reduce of a single float over `communicator`:
// on return f holds the MPI_MAX of every rank's input value.
void CartesianCommunicator::GlobalMax(float &f)
{
// Tag the flight recorder with the step name before entering the collective.
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
// A non-zero MPI return code is treated as fatal (checked only when asserts are enabled).
assert(ierr==0);
}
// In-place maximum all-reduce of a single double over `communicator`:
// on return d holds the MPI_MAX of every rank's input value.
void CartesianCommunicator::GlobalMax(double &d)
{
// Tag the flight recorder with the step name before entering the collective.
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
// A non-zero MPI return code is treated as fatal (checked only when asserts are enabled).
assert(ierr==0);
}
// In-place element-wise sum all-reduce of N floats starting at f over `communicator`.
//   f : buffer of N floats; used as both input and output (MPI_IN_PLACE).
//   N : number of elements passed to MPI_Allreduce.
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
// Tag the flight recorder with the step name before entering the collective.
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
// A non-zero MPI return code is treated as fatal (checked only when asserts are enabled).
assert(ierr==0);
}
// In-place element-wise sum all-reduce of N doubles starting at d over `communicator`.
//   d : buffer of N doubles; used as both input and output (MPI_IN_PLACE).
//   N : number of elements passed to MPI_Allreduce.
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
// Tag the flight recorder with the step name before entering the collective.
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
// A non-zero MPI return code is treated as fatal (checked only when asserts are enabled).
assert(ierr==0);
}
@@ -836,6 +841,7 @@ int CartesianCommunicator::RankWorld(void){
return r;
}
// Barrier over `communicator_world`: returns once MPI_Barrier on that communicator completes.
void CartesianCommunicator::BarrierWorld(void){
// Tag the flight recorder so a hang inside the barrier can be attributed to this step.
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world);
// A non-zero MPI return code is treated as fatal (checked only when asserts are enabled).
assert(ierr==0);
}

View File

@@ -544,19 +544,20 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
// acceleratorPin(HostCommBuf,bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << Mheader " acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
if ( WorldRank == 0 ){
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -587,8 +588,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
@@ -647,12 +646,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
// std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/"
// <<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
@@ -662,7 +661,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -671,9 +670,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
@@ -754,6 +750,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};

View File

@@ -143,9 +143,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else {
int words = buffer_size;
@@ -153,9 +155,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor;
int recv_from_rank;
@@ -166,6 +170,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tcomms-=usecond();
grid->Barrier();
FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@@ -182,10 +187,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);

View File

@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
grid->GlobalSumP2P(nrm);
// grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));

View File

@@ -240,7 +240,7 @@ void acceleratorInit(void)
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){

View File

@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <cstdlib>
#include <memory>
#include <Grid/Grid.h>
#include <Grid/util/CompilerCompatible.h>
#ifdef HAVE_UNWIND
#include <libunwind.h>
#endif
#include <fenv.h>
#ifdef __APPLE__
@@ -295,6 +299,20 @@ void GridBanner(void)
std::cout << std::setprecision(9);
}
//Some file local variables
static int fileno_stdout;
static int fileno_stderr;
static int signal_delay;
// One mapping parsed from /proc/self/maps; entries are collected in dlMap so
// that a raw instruction pointer can be reported as "<name>+<offset>".
// Scalar members are zero-initialised so a default-constructed region never
// carries indeterminate values.
class dlRegion {
public:
  uint64_t start  = 0; // first address of the mapping
  uint64_t end    = 0; // one-past-last address of the mapping
  uint64_t size   = 0; // end - start
  uint64_t offset = 0; // offset field of the maps line
  std::string name;    // pathname field of the maps line (may be empty)
};
std::vector<dlRegion> dlMap;
void Grid_init(int *argc,char ***argv)
{
@@ -347,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init();
}
// Sleep n-seconds at end of handler
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
GridCmdOptionInt(arg,signal_delay);
}
// periodic wakeup with stack trace printed
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
Grid_debug_heartbeat();
}
// periodic wakeup with empty handler (interrupts some system calls)
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
Grid_heartbeat();
}
#if defined(A64FX)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -396,15 +427,25 @@ void Grid_init(int *argc,char ***argv)
fp=freopen(ename.str().c_str(),"w",stderr);
assert(fp!=(FILE *)NULL);
}
fileno_stdout = fileno(stdout);
fileno_stderr = fileno(stderr) ;
////////////////////////////////////////////////////
// OK to use GridLogMessage etc from here on
////////////////////////////////////////////////////
std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
gethostname(hostname, HOST_NAME_MAX+1);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
{
gethostname(hostname, HOST_NAME_MAX+1);
time_t mytime;
struct tm *info;
char buffer[80];
time(&mytime);
info = localtime(&mytime);
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
}
/////////////////////////////////////////////////////////
// Reporting
@@ -421,6 +462,47 @@ void Grid_init(int *argc,char ***argv)
MemoryProfiler::stats = &dbgMemStats;
}
/////////////////////////////////////////////////////////
// LD.so space
/////////////////////////////////////////////////////////
#ifndef __APPLE__
{
  // Build dlMap: one entry per executable ("r-xp") mapping listed in
  // /proc/self/maps, so that addresses in a signal backtrace can be reported
  // as <library>+<offset>.
  FILE *f = fopen("/proc/self/maps", "r");
  if (f) {
    char line[256];
    while (fgets(line, sizeof(line), f)) {
      if (!strstr(line, "r-xp")) continue;

      // Local types match the conversion specifiers exactly.
      unsigned long start = 0, end = 0, offset = 0, inode = 0;
      unsigned int  major = 0, minor = 0;
      // The pathname cannot be longer than the line it was read from; start
      // empty because unnamed mappings have no pathname field at all.
      char path[sizeof(line)] = "";

      int nfields = sscanf(line,"%lx-%lx r-xp %lx %x:%x %lu %255s",
                           &start,&end,&offset,
                           &major,&minor,&inode,path);
      // Need at least the six numeric fields; otherwise "r-xp" matched
      // somewhere other than the permissions column and the line is skipped.
      if (nfields < 6) continue;

      dlRegion region;
      region.start  = start;
      region.end    = end;
      region.offset = offset;
      region.name   = std::string(path);
      region.size   = region.end-region.start;
      dlMap.push_back(region);
    }
    fclose(f);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
    std::cout << GridLogMessage << "================================================ "<<std::endl;
    std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
    std::cout << GridLogMessage << "================================================ "<<std::endl;
    for(const auto &region : dlMap){
      // Separate the name from the start address so the two cannot run together.
      std::cout << GridLogMessage<<" "<<region.name<<" "<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
    }
    std::cout << GridLogMessage << "================================================ "<<std::endl;
  }
}
#endif
////////////////////////////////////
// Logging
////////////////////////////////////
@@ -453,14 +535,19 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
@@ -555,17 +642,56 @@ void GridLogLayout() {
}
void * Grid_backtrace_buffer[_NBACKTRACE];
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
void sig_print_dig(uint32_t dig)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"FlightRecorder step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// x86 64bit
const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
if ( dig>=0 && dig< 16){
SIGLOG(digits[dig]);
}
}
// Write A in decimal via SIGLOG / sig_print_dig, without leading zeros.
// Uses no stdio and no allocation so it can be called from a signal handler.
// A value of zero is written as the single character "0".
void sig_print_uint(uint32_t A)
{
  int nz=0; // becomes non-zero once the first significant digit has been emitted
  // Walk the decimal places from most to least significant.
  // 10^9 is the highest place a 32-bit value can occupy (2^32 ~ 4.29e9).
  for (uint32_t div = 1000000000u; div != 0; div /= 10) {
    int dig = (A/div)%10;
    if (dig|nz) {
      sig_print_dig(dig);
    }
    nz = nz|dig;
  }
  if (nz==0) {
    SIGLOG("0");
  }
}
// Write A as "0x" followed by its hexadecimal digits via SIGLOG /
// sig_print_dig, suppressing leading zero nibbles. The lowest nibble is always
// written, so zero appears as "0x0". Uses no stdio and no allocation so it can
// be called from a signal handler.
void sig_print_hex(uint64_t A)
{
  int nz=0; // becomes non-zero once the first significant nibble has been emitted
  SIGLOG("0x");
  // Nibbles 15 down to 1, most significant first.
  for (int shift = 15*4; shift >= 4; shift -= 4) {
    int dig = (int)((A>>shift)&0xF);
    if (dig|nz) {
      sig_print_dig(dig);
    }
    nz = nz|dig;
  }
  // Least significant nibble is printed unconditionally.
  sig_print_dig(A&0xF);
}
/*
#ifdef __linux__
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
@@ -573,80 +699,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
*/
// Common body of the signal handlers in this file: writes a report to the
// stderr file descriptor using only SIGLOG (a raw ::write) and the
// sig_print_* helpers, then a backtrace.
//   sig : signal number as delivered to the handler (not read here; si->si_signo is printed)
//   si  : siginfo for the signal; si_signo, si_addr and si_code are printed
//   ptr : third SA_SIGINFO handler argument, interpreted as a ucontext_t*
// With HAVE_UNWIND the stack is walked with libunwind and each frame's IP is
// matched against dlMap; otherwise glibc backtrace() is used.
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
{
// Where: host name and process id.
SIGLOG("Signal handler on host ");
SIGLOG(hostname);
SIGLOG(" process id ");
sig_print_uint((uint32_t)getpid());
SIGLOG("\n");
// What the code was doing: last step counter / step name recorded by FlightRecorder.
SIGLOG("FlightRecorder step ");
sig_print_uint(FlightRecorder::StepLoggingCounter);
SIGLOG(" stage ");
SIGLOG(FlightRecorder::StepName);
SIGLOG("\n");
// Which signal, and the address / code carried in siginfo.
SIGLOG("Caught signal ");
sig_print_uint(si->si_signo);
SIGLOG("\n");
SIGLOG(" mem address ");
sig_print_hex((uint64_t)si->si_addr);
SIGLOG("\n");
SIGLOG(" code ");
// NOTE(review): si_code is an int and is printed through an unsigned helper,
// so a negative code would appear as a large positive number - confirm acceptable.
sig_print_uint(si->si_code);
SIGLOG("\n");
ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND
// Debug cross check on offsets
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
unw_cursor_t cursor;
// NOTE(review): `off` (and `name` below) are not used anywhere in this block.
unw_word_t ip, off;
// unw_init_local returns 0 on success; on failure no frames are printed.
if (!unw_init_local(&cursor, uc) ) {
SIGLOG(" frame IP function\n");
int level = 0;
int ret = 0;
while(1) {
char name[128];
// Stop before overrunning Grid_backtrace_buffer.
if (level >= _NBACKTRACE) return;
unw_get_reg(&cursor, UNW_REG_IP, &ip);
sig_print_uint(level); SIGLOG(" ");
sig_print_hex(ip); SIGLOG(" ");
// Report the first dlMap region containing this IP as "<name>+<ip - region start>".
for(int r=0;r<dlMap.size();r++){
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
SIGLOG(dlMap[r].name.c_str());
SIGLOG("+");
sig_print_hex((ip-dlMap[r].start));
break;
}
}
SIGLOG("\n");
// Keep the raw IP so the buffer holds the same data backtrace() would have filled.
Grid_backtrace_buffer[level]=(void *)ip;
level++;
// unw_step: <= 0 means no further frame (end of stack or error).
ret = unw_step(&cursor);
if (ret <= 0) {
return;
}
}
}
#else
// Fallback without libunwind. Known to be async-signal-unsafe.
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
#endif
}
// SA_SIGINFO-style handler that emits the common report from
// Grid_generic_handler followed by a blank line, then returns to the
// interrupted code.
// NOTE(review): no sigaction() call in the visible source installs this
// handler (Grid_debug_heartbeat installs Grid_usr_signal_handler) - confirm intent.
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
}
// Non-terminating signal handler: emits the common report from
// Grid_generic_handler and, when a positive --signal-delay was given, pauses
// for that many seconds before returning to the interrupted code.
//   sig, si, ptr : standard SA_SIGINFO handler arguments, forwarded unchanged.
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
  Grid_generic_handler(sig,si,ptr);
  // Only a strictly positive delay is meaningful; a negative value would
  // otherwise be converted to a huge unsigned sleep time.
  if (signal_delay > 0) {
    SIGLOG("Adding extra signal delay ");
    sig_print_uint(signal_delay);
    SIGLOG(" s\n");
    // sleep() takes whole seconds. usleep() is only specified for arguments
    // below 1,000,000 microseconds and wraps its 32-bit argument for long delays.
    sleep((unsigned int)signal_delay);
  }
  SIGLOG("\n");
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// Linux/Posix
#ifdef __linux__
// And x86 64bit
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
exit(0);
return;
};
// Deliberately empty SA_SIGINFO-style handler: it exists only so the signal
// can be delivered (Grid_heartbeat installs it for SIGALRM); nothing is
// printed and no state is modified.
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
  (void)sig;
  (void)si;
  (void)ptr;
  // SIGLOG("heartbeat signal handled\n");
}
// Enable the reporting heartbeat: route SIGALRM to Grid_usr_signal_handler and
// arm a repeating real-time interval timer with a 10 second period.
void Grid_debug_heartbeat(void)
{
  // SIGALRM -> Grid_usr_signal_handler, delivered with siginfo.
  struct sigaction action;
  sigemptyset(&action.sa_mask);
  action.sa_flags     = SA_SIGINFO;
  action.sa_sigaction = Grid_usr_signal_handler;
  sigaction(SIGALRM,&action,NULL);

  // First expiry and every subsequent interval: 10 s.
  struct itimerval period;
  period.it_interval.tv_sec  = 10;
  period.it_interval.tv_usec = 0;
  period.it_value = period.it_interval;
  setitimer(ITIMER_REAL, &period, NULL);
}
// Enable the silent heartbeat: route SIGALRM to Grid_empty_signal_handler and
// arm a repeating real-time interval timer. The handler does nothing; the
// point is the periodic signal delivery itself.
void Grid_heartbeat(void)
{
  // SIGALRM -> Grid_empty_signal_handler, delivered with siginfo.
  struct sigaction action;
  sigemptyset(&action.sa_mask);
  action.sa_flags     = SA_SIGINFO;
  action.sa_sigaction = Grid_empty_signal_handler;
  sigaction(SIGALRM,&action,NULL);

  // First expiry and every subsequent interval: 10 s + 1000 us.
  struct itimerval period;
  period.it_interval.tv_sec  = 10;
  period.it_interval.tv_usec = 1000;
  period.it_value = period.it_interval;
  setitimer(ITIMER_REAL, &period, NULL);
}
void Grid_exit_handler(void)
{
// BACKTRACEFP(stdout);
// fflush(stdout);
BACKTRACEFP(stdout);
fflush(stdout);
}
void Grid_debug_handler_init(void)
{
struct sigaction sa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_sigaction= Grid_fatal_signal_handler;
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
// sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
// sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
#ifndef GRID_SYCL
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
sigaction(SIGBUS,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
#endif
// Non terminating SIGUSR1/2 handler
// Non terminating SIGHUP handler
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;

View File

@@ -38,7 +38,11 @@ char * GridHostname(void);
// internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);
void Grid_debug_heartbeat(void);
void Grid_heartbeat(void);
void Grid_quiesce_nodes(void);
void Grid_unquiesce_nodes(void);

View File

@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
[try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
############### LIBUNWIND
AC_ARG_WITH([unwind],
[AS_HELP_STRING([--with-unwind=prefix],
[try this for a non-standard install prefix of the libunwind library])],
[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
############### OpenSSL
AC_ARG_WITH([openssl],
[AS_HELP_STRING([--with-openssl=prefix],
@@ -373,6 +381,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
[have_lime=true],
[AC_MSG_WARN(LIME library was not found in your system.)])
AC_SEARCH_LIBS([unw_backtrace], [unwind],
[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
[have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
# Probe for the architecture-specific libunwind-x86_64 library; defines
# HAVE_UNWIND_X86_64 when _Ux86_64_step can be linked.
AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind-x86_64 library was not found in your system.)])
AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
[have_crypto=true],

View File

@@ -179,8 +179,8 @@ int main(int argc, char** argv) {
Np=LanParams.Np;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
int MaxIt = 100;
RealD resid = 1.0e-4;
//while ( mass > - 5.0){

View File

@@ -125,4 +125,29 @@ Extensions
8) Example python code: FieldDensity.py . This is not interfaced to Grid.
================
Windowless generation of AVI files: must enable offscreen rendering. From Shuhei Yamamoto:
================
Hi Peter,
To make visualization work on Frontier, I did the following.
For headless off-screen rendering, ccmake tabs in advanced mode shown below are set as indicated.
VTK_OPENGL_HAS_* off
VTK_USE_X off
VTK_DEFAULT_RENDER_WINDOW_OFFSCREEN on
VTK_DEFAULT_RENDER_WINDOW_HEADLESS on
The list can be greater than necessary.
VTK can fall back to EGL or OSMesa at runtime. So I installed mesa via spack (as well as nasm and yasm). Either the mesa or the meson package requires llvm-config, which is included from rocm6.1 onwards. On Frontier, I used /opt/rocm-6.2.4. The only problem is that llvm-config is located in /opt/rocm-6.2.4/llvm/bin, instead of /opt/rocm-6.2.4/bin. So I edited packages.yaml for spack so that the prefix for the rocm compiler is /opt/rocm-6.2.4/llvm. Just in case, I also changed c and cxx to /opt/rocm-6.2.4/llvm/bin/amdclang and amdclang++, respectively, but this change might not be necessary.
After installation, I added a path to libOSMesa.so to LD_LIBRARY_PATH, for which there might be a better way such as specifying -rpath for OSMesa lib by editing cmake files.
In addition, I edited CMakeLists.txt for vtk to force vtk to find the OSMesa package via find_package(OSMesa REQUIRED) after list(INSERT CMAKE_MODULE_PATH 0 "${vtk_cmake_dir}"), as there is a Find package in vtk/CMake. There may be a more elegant method, but I was not able to find a tab to switch on OSMesa.
When I compiled vtk and linked to Grid visualization code, with ffmpeg option, it produces avi file.
Best,
Shuhei