Trying to pass TeamCity and Travis

2026-06-13 07:33:11 +01:00 · 2017-08-20 01:10:50 +01:00
parent be66e7dd95
commit a446d95c33
7 changed files with 67 additions and 30 deletions
@@ -218,7 +218,7 @@ public:
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  
  uint64_t lmax=48;
-#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (50*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){
@@ -368,7 +368,7 @@ public:
      const int num_cases = 4;
 #endif
      controls Cases [] = {
-#if defined(AVX512) 
+#ifdef AVX512
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
@@ -380,6 +380,10 @@ public:

      for(int c=0;c<num_cases;c++) {

+	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
@@ -390,10 +394,6 @@ public:
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

-
-	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
@@ -41,6 +41,7 @@ uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;
 CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
+int CartesianCommunicator::Hugepages = 0;

 /////////////////////////////////
 // Alloc, free shmem region
@@ -134,7 +135,10 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
 }
 void CartesianCommunicator::ShmInitGeneric(void){
 #if 1
-  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE,  MAP_HUGETLB| MAP_SHARED | MAP_ANONYMOUS, -1, 0); 
+
+  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
+  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE);  
  std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl;
 #else 
@@ -50,13 +50,24 @@ namespace Grid {
 class CartesianCommunicator {
  public:    

-  // 65536 ranks per node adequate for now
+
+  ////////////////////////////////////////////
+  // Isend/Irecv/Wait, or Sendrecv blocking
+  ////////////////////////////////////////////
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
+  static CommunicatorPolicy_t CommunicatorPolicy;
+  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
+
+  ///////////////////////////////////////////
+  // Up to 65536 ranks per node adequate for now
  // 128MB shared memory for comms enought for 48^4 local vol comms
  // Give external control (command line override?) of this
-
+  ///////////////////////////////////////////
  static const int MAXLOG2RANKSPERNODE = 16;            
  static uint64_t  MAX_MPI_SHM_BYTES;
  static int       nCommThreads;
+  // use explicit huge pages
+  static int       Hugepages;

  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
@@ -122,10 +133,6 @@ class CartesianCommunicator {
  /////////////////////////////////
  static void * ShmCommBuf;

-  // Isend/Irecv/Wait, or Sendrecv blocking
-  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
-  static CommunicatorPolicy_t CommunicatorPolicy;
-  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  
  size_t heap_top;
  size_t heap_bytes;
@@ -41,8 +41,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
+
+// Make up for Linux deficiencies
 #ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
+#define SHM_HUGETLB 0x0
+#endif
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x0
 #endif

 namespace Grid {
@@ -213,8 +218,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      ftruncate(fd, size);
+      
+      int mmap_flag = MAP_SHARED;
+      if (Hugepages) mmap_flag |= MAP_HUGETLB;
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);

-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);

@@ -628,8 +636,9 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int bytes,int dir)
 {
  std::vector<CommsRequest_t> list;
-  StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
+  return offbytes;
 }

 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -671,7 +680,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
-    this->StencilSendToRecvFromComplete(list);
+    this->StencilSendToRecvFromComplete(list,dir);
  }

  return off_node_bytes;
@@ -135,10 +135,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
-    std::vector<int> latt = GridDefaultLatt();          
-    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-    RealD NP = _FourDimGrid->_Nprocessors;
-    RealD NN = _FourDimGrid->NodeCount();
+  RealD NP     = _FourDimGrid->_Nprocessors;
+  RealD NN     = _FourDimGrid->NodeCount();
+  RealD volume = Ls;  
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];

  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -390,17 +391,18 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  // Rely on async comms; start comms before merge of local data
  double ctime=0;
  double ptime=0;
-  //  DhopComputeTime-=usecond();
-  //  DhopCommTime-=usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma omp parallel reduction(max:ctime) reduction(max:ptime)
  { 
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = st.Packets.size(); 
+    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
@@ -252,10 +252,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  //////////////////////////////////////////
  void CommunicateThreaded()
  {
+#ifdef GRID_OMP
    // must be called in parallel region
    int mythread = omp_get_thread_num();
    int nthreads = CartesianCommunicator::nCommThreads;
-    if (nthreads == -1) nthreads = Packets.size();
+#else
+    int mythread = 0;
+    int nthreads = 1;
+#endif
+    if (nthreads == -1) nthreads = 1;
    if (mythread < nthreads) {
      for (int i = mythread; i < Packets.size(); i += nthreads) {
 	double start = usecond();
@@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv)
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
  }

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
+    CartesianCommunicator::Hugepages = 1;
+  }
+
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
+    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +323,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
    std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
-    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;    
+    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
@@ -356,12 +362,13 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  }
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
  CartesianCommunicator::nCommThreads = -1;
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--commthreads") ){
-    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--commthreads");
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
@@ -378,7 +385,10 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);

-  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  if ( CartesianCommunicator::Hugepages) {
+    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
+  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";