mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Compare commits
	
		
			3 Commits
		
	
	
		
			066544281f
			...
			9fa8bd6438
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
| | 9fa8bd6438 | | |
| | 02c8178f16 | | |
| | e637fbacae | | |
@@ -55,13 +55,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
 | 
			
		||||
  RealD t1,t0;
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  if ( !comm_dim ) {
 | 
			
		||||
    std::cout << "CSHIFT: Cshift_local" <<std::endl;
 | 
			
		||||
    //    std::cout << "CSHIFT: Cshift_local" <<std::endl;
 | 
			
		||||
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
 | 
			
		||||
  } else if ( splice_dim ) {
 | 
			
		||||
    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
 | 
			
		||||
    //    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
 | 
			
		||||
    Cshift_comms_simd(ret,rhs,dimension,shift);
 | 
			
		||||
  } else {
 | 
			
		||||
    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
 | 
			
		||||
    //    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
 | 
			
		||||
    Cshift_comms(ret,rhs,dimension,shift);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
@@ -76,12 +76,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
 | 
			
		||||
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
 | 
			
		||||
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
 | 
			
		||||
 | 
			
		||||
  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
			
		||||
  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
			
		||||
  if ( sshift[0] == sshift[1] ) {
 | 
			
		||||
    std::cout << "Single pass Cshift_comms" <<std::endl;
 | 
			
		||||
    //    std::cout << "Single pass Cshift_comms" <<std::endl;
 | 
			
		||||
    Cshift_comms(ret,rhs,dimension,shift,0x3);
 | 
			
		||||
  } else {
 | 
			
		||||
    std::cout << "Two pass Cshift_comms" <<std::endl;
 | 
			
		||||
    //    std::cout << "Two pass Cshift_comms" <<std::endl;
 | 
			
		||||
    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
			
		||||
    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
 | 
			
		||||
  }
 | 
			
		||||
@@ -94,12 +94,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
 | 
			
		||||
  sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
 | 
			
		||||
  sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
 | 
			
		||||
 | 
			
		||||
  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
			
		||||
  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
			
		||||
  if ( sshift[0] == sshift[1] ) {
 | 
			
		||||
    std::cout << "Single pass Cshift_comms" <<std::endl;
 | 
			
		||||
    //    std::cout << "Single pass Cshift_comms" <<std::endl;
 | 
			
		||||
    Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
 | 
			
		||||
  } else {
 | 
			
		||||
    std::cout << "Two pass Cshift_comms" <<std::endl;
 | 
			
		||||
    //    std::cout << "Two pass Cshift_comms" <<std::endl;
 | 
			
		||||
    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
			
		||||
    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
 | 
			
		||||
  }
 | 
			
		||||
@@ -197,9 +197,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 | 
			
		||||
  int simd_layout     = grid->_simd_layout[dimension];
 | 
			
		||||
  int comm_dim        = grid->_processors[dimension] >1 ;
 | 
			
		||||
 | 
			
		||||
  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
 | 
			
		||||
	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
 | 
			
		||||
	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
 | 
			
		||||
  //  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
 | 
			
		||||
  //	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout 
 | 
			
		||||
  //	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
 | 
			
		||||
 | 
			
		||||
  assert(comm_dim==1);
 | 
			
		||||
  assert(simd_layout==2);
 | 
			
		||||
 
 | 
			
		||||
@@ -325,12 +325,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 | 
			
		||||
  // Start comms  // Gather intranode and extra node differentiated??
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << " WilsonFermion5D gather " <<std::endl;
 | 
			
		||||
    //    std::cout << " WilsonFermion5D gather " <<std::endl;
 | 
			
		||||
    GRID_TRACE("Gather");
 | 
			
		||||
    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
 | 
			
		||||
  std::vector<std::vector<CommsRequest_t> > requests;
 | 
			
		||||
  auto id=traceStart("Communicate overlapped");
 | 
			
		||||
  st.CommunicateBegin(requests);
 | 
			
		||||
@@ -339,7 +339,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 | 
			
		||||
  // Overlap with comms
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
  std::cout << " WilsonFermion5D Comms merge " <<std::endl;
 | 
			
		||||
    //  std::cout << " WilsonFermion5D Comms merge " <<std::endl;
 | 
			
		||||
    GRID_TRACE("MergeSHM");
 | 
			
		||||
    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
 | 
			
		||||
  }
 | 
			
		||||
@@ -347,7 +347,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  // do the compute interior
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  std::cout << " WilsonFermion5D Interior " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Interior " <<std::endl;
 | 
			
		||||
  int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    GRID_TRACE("DhopDagInterior");
 | 
			
		||||
@@ -360,7 +360,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  // Complete comms
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
 | 
			
		||||
  st.CommunicateComplete(requests);
 | 
			
		||||
  traceStop(id);
 | 
			
		||||
 | 
			
		||||
@@ -368,13 +368,13 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 | 
			
		||||
  // do the compute exterior
 | 
			
		||||
  /////////////////////////////
 | 
			
		||||
  {
 | 
			
		||||
    std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
 | 
			
		||||
    //    std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
 | 
			
		||||
    GRID_TRACE("Merge");
 | 
			
		||||
    st.CommsMerge(compressor);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  std::cout << " WilsonFermion5D Exterior " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Exterior " <<std::endl;
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    GRID_TRACE("DhopDagExterior");
 | 
			
		||||
    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
 | 
			
		||||
@@ -382,7 +382,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
 | 
			
		||||
    GRID_TRACE("DhopExterior");
 | 
			
		||||
    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << " WilsonFermion5D Done " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -397,13 +397,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 | 
			
		||||
 | 
			
		||||
  int LLs = in.Grid()->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
  std::cout << " WilsonFermion5D Halo exch " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Halo exch " <<std::endl;
 | 
			
		||||
  {
 | 
			
		||||
    GRID_TRACE("HaloExchange");
 | 
			
		||||
    st.HaloExchangeOpt(in,compressor);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  std::cout << " WilsonFermion5D Dhop " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Dhop " <<std::endl;
 | 
			
		||||
  int Opt = WilsonKernelsStatic::Opt;
 | 
			
		||||
  if (dag == DaggerYes) {
 | 
			
		||||
    GRID_TRACE("DhopDag");
 | 
			
		||||
@@ -412,7 +412,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 | 
			
		||||
    GRID_TRACE("Dhop");
 | 
			
		||||
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
 | 
			
		||||
  }
 | 
			
		||||
  std::cout << " WilsonFermion5D Done " <<std::endl;
 | 
			
		||||
  //  std::cout << " WilsonFermion5D Done " <<std::endl;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -365,8 +365,8 @@ public:
 | 
			
		||||
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
 | 
			
		||||
  {
 | 
			
		||||
    // All GPU kernel tasks must complete
 | 
			
		||||
    //    accelerator_barrier();     // All kernels should ALREADY be complete
 | 
			
		||||
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
 | 
			
		||||
    accelerator_barrier();     // All kernels should ALREADY be complete
 | 
			
		||||
    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
 | 
			
		||||
                               // But the HaloGather had a barrier too.
 | 
			
		||||
    for(int i=0;i<Packets.size();i++){
 | 
			
		||||
      _grid->StencilSendToRecvFromBegin(MpiReqs,
 | 
			
		||||
@@ -390,8 +390,8 @@ public:
 | 
			
		||||
    if   ( this->partialDirichlet ) DslashLogPartial();
 | 
			
		||||
    else if ( this->fullDirichlet ) DslashLogDirichlet();
 | 
			
		||||
    else DslashLogFull();
 | 
			
		||||
    // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
 | 
			
		||||
    //    accelerator_barrier(); 
 | 
			
		||||
    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
 | 
			
		||||
    accelerator_barrier(); 
 | 
			
		||||
    _grid->StencilBarrier(); 
 | 
			
		||||
    // run any checksums
 | 
			
		||||
    for(int i=0;i<Packets.size();i++){
 | 
			
		||||
@@ -473,7 +473,7 @@ public:
 | 
			
		||||
  template<class compressor>
 | 
			
		||||
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
			
		||||
  {
 | 
			
		||||
    //    accelerator_barrier();
 | 
			
		||||
    accelerator_barrier();
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
 | 
			
		||||
    assert(source.Grid()==_grid);
 | 
			
		||||
@@ -487,6 +487,7 @@ public:
 | 
			
		||||
      HaloGatherDir(source,compress,point,face_idx);
 | 
			
		||||
    }
 | 
			
		||||
    accelerator_barrier(); // All my local gathers are complete
 | 
			
		||||
    _grid->StencilBarrier();// Synch shared memory on a single nodes
 | 
			
		||||
    face_table_computed=1;
 | 
			
		||||
    assert(u_comm_offset==_unified_buffer_size);
 | 
			
		||||
  }
 | 
			
		||||
@@ -653,7 +654,9 @@ public:
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
 | 
			
		||||
    surface_list.resize(surface_list_size);
 | 
			
		||||
    std::vector<int> surface_list_host(surface_list_size);
 | 
			
		||||
    int32_t ss=0;
 | 
			
		||||
    for(int site = 0 ;site< vol4;site++){
 | 
			
		||||
      int local = 1;
 | 
			
		||||
@@ -665,12 +668,12 @@ public:
 | 
			
		||||
      if(local == 0) {
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int idx=site*Ls+s;
 | 
			
		||||
	  acceleratorPut(surface_list[ss],idx);
 | 
			
		||||
	  surface_list_host[ss]= idx;
 | 
			
		||||
	  ss++;
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
 | 
			
		||||
    acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
 | 
			
		||||
  }
 | 
			
		||||
  /// Introduce a block structure and switch off comms on boundaries
 | 
			
		||||
  void DirichletBlock(const Coordinate &dirichlet_block)
 | 
			
		||||
 
 | 
			
		||||
@@ -549,8 +549,31 @@ void GridLogLayout() {
 | 
			
		||||
 | 
			
		||||
void * Grid_backtrace_buffer[_NBACKTRACE];
 | 
			
		||||
 | 
			
		||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
{
 | 
			
		||||
  fprintf(stderr,"Signal handler on host %s\n",hostname);
 | 
			
		||||
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
 | 
			
		||||
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
			
		||||
  fprintf(stderr,"         code %d\n",si->si_code);
 | 
			
		||||
  // x86 64bit
 | 
			
		||||
#ifdef __linux__
 | 
			
		||||
#ifdef __x86_64__
 | 
			
		||||
  ucontext_t * uc= (ucontext_t *)ptr;
 | 
			
		||||
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
			
		||||
  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
  fflush(stderr);
 | 
			
		||||
  BACKTRACEFP(stderr);
 | 
			
		||||
  fprintf(stderr,"Called backtrace\n");
 | 
			
		||||
  fflush(stdout);
 | 
			
		||||
  fflush(stderr);
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
{
 | 
			
		||||
  fprintf(stderr,"Signal handler on host %s\n",hostname);
 | 
			
		||||
  fprintf(stderr,"Caught signal %d\n",si->si_signo);
 | 
			
		||||
  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
			
		||||
  fprintf(stderr,"         code %d\n",si->si_code);
 | 
			
		||||
@@ -561,7 +584,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
  ucontext_t * uc= (ucontext_t *)ptr;
 | 
			
		||||
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
			
		||||
  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
 | 
			
		||||
#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
 | 
			
		||||
#define REG(A)  fprintf(stderr,"  %s %lx\n",#A,sc-> A);
 | 
			
		||||
  REG(rdi);
 | 
			
		||||
  REG(rsi);
 | 
			
		||||
  REG(rbp);
 | 
			
		||||
@@ -594,8 +617,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
			
		||||
 | 
			
		||||
void Grid_exit_handler(void)
 | 
			
		||||
{
 | 
			
		||||
  BACKTRACEFP(stdout);
 | 
			
		||||
  fflush(stdout);
 | 
			
		||||
  //  BACKTRACEFP(stdout);
 | 
			
		||||
  //  fflush(stdout);
 | 
			
		||||
}
 | 
			
		||||
void Grid_debug_handler_init(void)
 | 
			
		||||
{
 | 
			
		||||
@@ -603,10 +626,10 @@ void Grid_debug_handler_init(void)
 | 
			
		||||
  sigemptyset (&sa.sa_mask);
 | 
			
		||||
  sa.sa_sigaction= Grid_sa_signal_handler;
 | 
			
		||||
  sa.sa_flags    = SA_SIGINFO;
 | 
			
		||||
  sigaction(SIGSEGV,&sa,NULL);
 | 
			
		||||
  //  sigaction(SIGSEGV,&sa,NULL);
 | 
			
		||||
  sigaction(SIGTRAP,&sa,NULL);
 | 
			
		||||
  sigaction(SIGBUS,&sa,NULL);
 | 
			
		||||
  sigaction(SIGUSR2,&sa,NULL);
 | 
			
		||||
  //  sigaction(SIGUSR2,&sa,NULL);
 | 
			
		||||
 | 
			
		||||
  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
 | 
			
		||||
 | 
			
		||||
@@ -614,7 +637,14 @@ void Grid_debug_handler_init(void)
 | 
			
		||||
  sigaction(SIGKILL,&sa,NULL);
 | 
			
		||||
  sigaction(SIGILL,&sa,NULL);
 | 
			
		||||
 | 
			
		||||
  atexit(Grid_exit_handler);
 | 
			
		||||
  // Non terminating SIGUSR1/2 handler
 | 
			
		||||
  struct sigaction sa_ping;
 | 
			
		||||
  sigemptyset (&sa_ping.sa_mask);
 | 
			
		||||
  sa_ping.sa_sigaction= Grid_usr_signal_handler;
 | 
			
		||||
  sa_ping.sa_flags    = SA_SIGINFO;
 | 
			
		||||
  sigaction(SIGHUP,&sa_ping,NULL);
 | 
			
		||||
 | 
			
		||||
  //  atexit(Grid_exit_handler);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										23
									
								
								systems/Aurora-AOT/config-command
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								systems/Aurora-AOT/config-command
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,23 @@
 | 
			
		||||
#Ahead of time compile for PVC
 | 
			
		||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
			
		||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
 | 
			
		||||
 | 
			
		||||
#JIT compile 
 | 
			
		||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl " 
 | 
			
		||||
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "
 | 
			
		||||
 | 
			
		||||
../../configure \
 | 
			
		||||
	--enable-simd=GPU \
 | 
			
		||||
	--enable-gen-simd-width=64 \
 | 
			
		||||
	--enable-comms=mpi-auto \
 | 
			
		||||
	--enable-debug \
 | 
			
		||||
	--disable-gparity \
 | 
			
		||||
	--disable-fermion-reps \
 | 
			
		||||
	--with-lime=$CLIME \
 | 
			
		||||
	--enable-shm=nvlink \
 | 
			
		||||
	--enable-accelerator=sycl \
 | 
			
		||||
	--enable-accelerator-aware-mpi=yes\
 | 
			
		||||
	--enable-unified=no \
 | 
			
		||||
	MPICXX=mpicxx \
 | 
			
		||||
	CXX=icpx 
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										15
									
								
								systems/Aurora-AOT/sourceme.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								systems/Aurora-AOT/sourceme.sh
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,15 @@
 | 
			
		||||
#module load oneapi/release/2023.12.15.001
 | 
			
		||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
 | 
			
		||||
#module load mpich-config/mode/deterministic
 | 
			
		||||
#module load intel_compute_runtime/release/821.35
 | 
			
		||||
 | 
			
		||||
source ~/spack/share/spack/setup-env.sh 
 | 
			
		||||
spack load c-lime
 | 
			
		||||
spack load openssl
 | 
			
		||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
 | 
			
		||||
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 | 
			
		||||
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 | 
			
		||||
export http_proxy=http://proxy.alcf.anl.gov:3128
 | 
			
		||||
export https_proxy=http://proxy.alcf.anl.gov:3128
 | 
			
		||||
git config --global http.proxy http://proxy.alcf.anl.gov:3128
 | 
			
		||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 | 
			
		||||
							
								
								
									
										74
									
								
								systems/Aurora-AOT/tests/reproBigJob.pbs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								systems/Aurora-AOT/tests/reproBigJob.pbs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,74 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
#PBS -l select=512
 | 
			
		||||
#PBS -q EarlyAppAccess
 | 
			
		||||
#PBS -A LatticeQCD_aesp_CNDA
 | 
			
		||||
#PBS -l walltime=6:00:00
 | 
			
		||||
#PBS -N reproBigJob
 | 
			
		||||
#PBS -k doe
 | 
			
		||||
 | 
			
		||||
#export OMP_PROC_BIND=spread
 | 
			
		||||
#unset OMP_PLACES
 | 
			
		||||
 | 
			
		||||
#module load oneapi/eng-compiler/2023.05.15.003
 | 
			
		||||
#module load mpich/51.2/icc-all-deterministic-pmix-gpu
 | 
			
		||||
 | 
			
		||||
# 56 cores / 6 threads ~9
 | 
			
		||||
export OMP_NUM_THREADS=6
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 | 
			
		||||
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
 | 
			
		||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 | 
			
		||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 | 
			
		||||
 | 
			
		||||
export GRID_PRINT_ENTIRE_LOG=0
 | 
			
		||||
export GRID_CHECKSUM_RECV_BUF=0
 | 
			
		||||
export GRID_CHECKSUM_SEND_BUF=0
 | 
			
		||||
 | 
			
		||||
export MPICH_OFI_NIC_POLICY=GPU
 | 
			
		||||
 | 
			
		||||
#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 | 
			
		||||
#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 | 
			
		||||
#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 | 
			
		||||
#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
 | 
			
		||||
cd $PBS_O_WORKDIR
 | 
			
		||||
 | 
			
		||||
cp $PBS_NODEFILE nodefile
 | 
			
		||||
 | 
			
		||||
DIR=reproBigJob.$PBS_JOBID
 | 
			
		||||
 | 
			
		||||
mkdir -p $DIR
 | 
			
		||||
cd $DIR
 | 
			
		||||
 | 
			
		||||
cp $PBS_NODEFILE nodefile
 | 
			
		||||
 | 
			
		||||
BINARY=../Test_dwf_mixedcg_prec
 | 
			
		||||
 | 
			
		||||
# Generate the "pingjob" helper script, which walks the node list in ./nodefile
# and prints one ssh/killall command per node (it echoes the commands, it does
# not run them).
#
# cat, not echo: echo does not read stdin, so "echo > pingjob <<EOF" threw the
# here-document away and left pingjob containing a single empty line.
# The delimiter is quoted so that $node is written literally into pingjob
# instead of being expanded (to nothing) while this job script runs.
#
# NOTE(review): the default action for SIGUSR1 is to terminate the process;
# confirm the target binary installs a non-terminating handler for USR1.
cat > pingjob <<'EOF'
while read node ;
do
	echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
done < nodefile
EOF
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 6144 -ppn 12  -envall --hostfile nodefile \
 | 
			
		||||
	     ../gpu_tile_compact.sh \
 | 
			
		||||
	     $BINARY --mpi 8.8.8.12 --grid 128.128.128.288 \
 | 
			
		||||
	--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --debug-stdout --log Message --debug-signals --comms-overlap"
 | 
			
		||||
 | 
			
		||||
echo $CMD > command-line
 | 
			
		||||
env > environment
 | 
			
		||||
$CMD
 | 
			
		||||
# Gather every line containing "Oops" from the Grid.stderr.* files into a
# per-job failures file.
grep Oops Grid.stderr.* > failures.$PBS_JOBID
# -f: when the run left no core files, plain "rm core.*" fails, and because it
# is the last command that failure becomes the job script's exit status.
rm -f core.*
 | 
			
		||||
@@ -5,63 +5,34 @@
 | 
			
		||||
#PBS -l walltime=00:20:00
 | 
			
		||||
#PBS -A LatticeQCD_aesp_CNDA
 | 
			
		||||
 | 
			
		||||
#export OMP_PROC_BIND=spread
 | 
			
		||||
#unset OMP_PLACES
 | 
			
		||||
 | 
			
		||||
cd $PBS_O_WORKDIR
 | 
			
		||||
 | 
			
		||||
source ../sourceme.sh
 | 
			
		||||
module load pti-gpu
 | 
			
		||||
 | 
			
		||||
#cat $PBS_NODEFILE
 | 
			
		||||
cp $PBS_NODEFILE nodefile
 | 
			
		||||
 | 
			
		||||
export OMP_NUM_THREADS=4
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 | 
			
		||||
 | 
			
		||||
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 | 
			
		||||
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 | 
			
		||||
unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 | 
			
		||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
export MPICH_OFI_NIC_POLICY=GPU
 | 
			
		||||
 | 
			
		||||
# 12 ppn, 2 nodes, 24 ranks
 | 
			
		||||
#
 | 
			
		||||
CMD="mpiexec -np 1 -ppn 1  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_usqcd --mpi 1.1.1.1 --grid 24.32.32.24 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
 | 
			
		||||
$CMD | tee usqcd.log
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 1 -ppn 1  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
 | 
			
		||||
$CMD | tee 1tile.dwf
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
#$CMD | tee 1node.32.32.32.48.dwf
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
#$CMD | tee 1node.64.64.32.96.dwf
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 12 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
#$CMD | tee 1node.64.32.32.48.dwf
 | 
			
		||||
#for f in 1 2 3 4 5 6 7 8
 | 
			
		||||
for f in 1
 | 
			
		||||
do
 | 
			
		||||
echo $CMD
 | 
			
		||||
$CMD | tee 1node.32.32.64.48.dwf.hbm.$f
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -11,17 +11,16 @@
 | 
			
		||||
cd $PBS_O_WORKDIR
 | 
			
		||||
 | 
			
		||||
source ../sourceme.sh
 | 
			
		||||
module load pti-gpu
 | 
			
		||||
#module load pti-gpu
 | 
			
		||||
 | 
			
		||||
#cat $PBS_NODEFILE
 | 
			
		||||
 | 
			
		||||
cp $PBS_NODEFILE nodefile
 | 
			
		||||
 | 
			
		||||
export OMP_NUM_THREADS=4
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 | 
			
		||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
 | 
			
		||||
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 | 
			
		||||
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 | 
			
		||||
@@ -34,22 +33,26 @@ export MPICH_OFI_NIC_POLICY=GPU
 | 
			
		||||
# 12 ppn, 2 nodes, 24 ranks
 | 
			
		||||
#
 | 
			
		||||
CMD="mpiexec -np 24 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./gpu_tile.sh \
 | 
			
		||||
	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
 | 
			
		||||
$CMD | tee 2node.comms
 | 
			
		||||
#$CMD | tee 2node.comms.hbm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 24 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
 | 
			
		||||
$CMD | tee 2node.32.32.64.48.dwf
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
 | 
			
		||||
 | 
			
		||||
#for f in 1 2 3 4 5 6 7 8
 | 
			
		||||
for f in 1
 | 
			
		||||
do
 | 
			
		||||
echo $CMD
 | 
			
		||||
$CMD | tee 2node.32.32.64.48.dwf.hbm.$f
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 24 -ppn 12  -envall \
 | 
			
		||||
	     ./gpu_tile_compact.sh \
 | 
			
		||||
	     ./gpu_tile.sh \
 | 
			
		||||
	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
 | 
			
		||||
		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
 | 
			
		||||
$CMD | tee 2node.64.64.64.96.dwf
 | 
			
		||||
		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
 | 
			
		||||
#$CMD | tee 2node.64.64.64.96.dwf.hbm
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,6 @@
 | 
			
		||||
module load oneapi/release/2023.12.15.001
 | 
			
		||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
 | 
			
		||||
#module load mpich-config/mode/deterministic
 | 
			
		||||
#module load intel_compute_runtime/release/821.35
 | 
			
		||||
source ~/spack/share/spack/setup-env.sh 
 | 
			
		||||
spack load c-lime
 | 
			
		||||
 
 | 
			
		||||
@@ -15,13 +15,13 @@
 | 
			
		||||
 | 
			
		||||
# 56 cores / 6 threads ~9
 | 
			
		||||
export OMP_NUM_THREADS=6
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 | 
			
		||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 | 
			
		||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 | 
			
		||||
 | 
			
		||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 | 
			
		||||
@@ -30,20 +30,22 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 | 
			
		||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
 | 
			
		||||
 | 
			
		||||
export GRID_PRINT_ENTIRE_LOG=0
 | 
			
		||||
export GRID_CHECKSUM_RECV_BUF=1
 | 
			
		||||
export GRID_CHECKSUM_SEND_BUF=1
 | 
			
		||||
export GRID_CHECKSUM_RECV_BUF=0
 | 
			
		||||
export GRID_CHECKSUM_SEND_BUF=0
 | 
			
		||||
 | 
			
		||||
export MPICH_OFI_NIC_POLICY=GPU
 | 
			
		||||
 | 
			
		||||
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 | 
			
		||||
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 | 
			
		||||
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 | 
			
		||||
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
 | 
			
		||||
#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
 | 
			
		||||
#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
 | 
			
		||||
#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
 | 
			
		||||
 | 
			
		||||
cd $PBS_O_WORKDIR
 | 
			
		||||
 | 
			
		||||
cp $PBS_NODEFILE nodefile
 | 
			
		||||
 | 
			
		||||
DIR=reproBigJob.$PBS_JOBID
 | 
			
		||||
 | 
			
		||||
mkdir -p $DIR
 | 
			
		||||
@@ -51,10 +53,19 @@ cd $DIR
 | 
			
		||||
 | 
			
		||||
cp $PBS_NODEFILE nodefile
 | 
			
		||||
 | 
			
		||||
BINARY=../Test_dwf_mixedcg_prec
 | 
			
		||||
 | 
			
		||||
# Generate the "pingjob" helper script, which walks the node list in ./nodefile
# and prints one ssh/killall command per node (it echoes the commands, it does
# not run them).
#
# cat, not echo: echo does not read stdin, so "echo > pingjob <<EOF" threw the
# here-document away and left pingjob containing a single empty line.
# The delimiter is quoted so that $node is written literally into pingjob
# instead of being expanded (to nothing) while this job script runs.
#
# NOTE(review): the default action for SIGUSR1 is to terminate the process;
# confirm the target binary installs a non-terminating handler for USR1.
cat > pingjob <<'EOF'
while read node ;
do
	echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
done < nodefile
EOF
 | 
			
		||||
 | 
			
		||||
CMD="mpiexec -np 384 -ppn 12  -envall --hostfile nodefile \
 | 
			
		||||
	     ../gpu_tile_compact.sh \
 | 
			
		||||
	     ../Test_dwf_mixedcg_prec --mpi 4.4.4.6 --grid 128.128.128.96  \
 | 
			
		||||
		--shm-mpi 1 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
 | 
			
		||||
	     $BINARY --mpi 4.4.4.6 --grid 128.128.128.96  \
 | 
			
		||||
		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
 | 
			
		||||
 | 
			
		||||
echo $CMD > command-line
 | 
			
		||||
env > environment
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user