Mirror of https://github.com/paboyle/Grid.git

Shmem comms [NO MPI] target added. The dwf test runs and passes.

Not really shaken out to my satisfaction yet, though, as I want more tests done, so don't declare it as working.
But committing my current state while I try a few experiments.
Peter Boyle 2016-02-14 14:24:38 -06:00
parent 294dbf1bf0
commit 41c2b09184
14 changed files with 157 additions and 31 deletions

configure

@@ -5898,7 +5898,7 @@ _ACEOF
echo Configuring for SHMEM communications
cat >>confdefs.h <<\_ACEOF
-#define GRID_COMMS_MPI 1
+#define GRID_COMMS_SHMEM 1
_ACEOF
;;


@@ -180,7 +180,7 @@ case ${ac_COMMS} in
;;
shmem)
echo Configuring for SHMEM communications
-AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_SHMEM] )
+AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
;;
*)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);


@@ -75,18 +75,30 @@ public:
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
-pointer allocate(size_type __n, const void* = 0)
+pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef GRID_COMMS_SHMEM
-_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
#define PARANOID_SYMMETRIC_HEAP
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
shmem_barrier_all();
+_Tp *ptr = (_Tp *) shmem_align(128,__n*sizeof(_Tp));
+shmem_barrier_all();
-bcast = (void *) _Tp;
+bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
-assert( bcast == (void *) _Tp);
+if ( bcast != ptr ) {
+std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),(unsigned long)bcast,(unsigned long)ptr); std::fflush(stdout);
+BACKTRACEFILE();
+exit(0);
+}
+assert( bcast == (void *) ptr);
#endif
#else
#ifdef HAVE_MM_MALLOC_H
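
The PARANOID_SYMMETRIC_HEAP check above guards SHMEM's key allocation invariant: shmem_align is a collective on the symmetric heap, so every PE must perform the same allocations in the same order for the returned addresses to agree across PEs. The check broadcasts PE 0's pointer and compares it to the local one. A minimal standalone sketch of the same check, assuming an OpenSHMEM 1.2 implementation (names and sizes here are illustrative, not Grid's):

#include <shmem.h>
#include <cstdio>

static long psync[SHMEM_BCAST_SYNC_SIZE];
static void *bcast; // broadcast buffer must itself be symmetric, hence static

int main(void) {
  shmem_init();
  // pSync must start at SHMEM_SYNC_VALUE before first use, per the spec.
  for (int i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) psync[i] = SHMEM_SYNC_VALUE;
  shmem_barrier_all();

  void *ptr = shmem_align(128, 1024); // collective symmetric-heap allocation
  shmem_barrier_all();

  // Ship PE 0's address to every PE; if all PEs allocate in the same order,
  // the symmetric heap guarantees the addresses agree.
  bcast = ptr;
  shmem_broadcast32(&bcast, &bcast, sizeof(void *) / 4, 0, 0, 0,
                    shmem_n_pes(), psync);
  if (bcast != ptr) {
    std::printf("inconsistent alloc on pe %d\n", shmem_my_pe());
    shmem_global_exit(1);
  }
  shmem_free(ptr);
  shmem_finalize();
  return 0;
}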


@@ -15,12 +15,15 @@
/* EMPTY_SIMD only for DEBUGGING */
#undef EMPTY_SIMD
-/* GRID_COMMS_SHMEM */
+/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
+/* GRID_COMMS_SHMEM */
+#undef GRID_COMMS_SHMEM
/* GRID_DEFAULT_PRECISION is DOUBLE */
#undef GRID_DEFAULT_PRECISION_DOUBLE


@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI
#include <cshift/Cshift_mpi.h>
#endif
+#ifdef GRID_COMMS_SHMEM
+#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif
#endif


@@ -47,9 +47,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define __X86_64
-#ifdef HAVE_EXECINFO_H
-#include <execinfo.h>
-#endif
namespace Grid {
@@ -174,9 +171,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
/////////////////////////////////////////////////////////
void Grid_init(int *argc,char ***argv)
{
-#ifdef GRID_COMMS_MPI
-MPI_Init(argc,argv);
-#endif
+CartesianCommunicator::Init(argc,argv);
// Parse command line args.
GridLogger::StopWatch.Start();
@@ -284,7 +280,6 @@ double usecond(void) {
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
-#define _NBACKTRACE (256)
void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)


@@ -78,13 +78,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void)
{
-#ifdef GRID_COMMS_MPI
int me;
+#ifdef GRID_COMMS_MPI
MPI_Comm_rank(MPI_COMM_WORLD,&me);
+#endif
+#ifdef GRID_COMMS_SHMEM
+me = shmem_my_pe();
+#endif
if ( me ) {
std::cout.setstate(std::ios::badbit);
}
-#endif
}
void Grid_unquiesce_nodes(void)
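
Grid_quiesce_nodes now works for either comms target: it looks up the rank (MPI_Comm_rank or shmem_my_pe) and silences std::cout on every rank but 0 by putting the stream into a failed state, so subsequent inserts become no-ops. A standalone sketch of the trick, independent of any comms library:

#include <iostream>

// Swallow all std::cout output on non-root ranks by setting badbit.
void quiesce(int rank) {
  if (rank != 0) std::cout.setstate(std::ios::badbit);
}
// Restore the stream to a good state so output flows again.
void unquiesce() {
  std::cout.clear();
}

int main() {
  int rank = 1;               // pretend we are a non-root rank
  quiesce(rank);
  std::cout << "invisible\n"; // dropped: stream is in a failed state
  unquiesce();
  std::cout << "visible\n";   // printed normally
  return 0;
}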


@@ -29,9 +29,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef GRID_LOG_H
#define GRID_LOG_H
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
namespace Grid {
// Dress the output; use std::chrono for time stamping via the StopWatch class
+int Rank(void); // used for early stage debug before library init
class Logger {
@@ -89,5 +95,35 @@ extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+#ifdef HAVE_EXECINFO_H
+#define BACKTRACEFILE() {\
+  char string[20]; \
+  std::sprintf(string,"backtrace.%d",Rank()); \
+  std::FILE * fp = std::fopen(string,"w"); \
+  BACKTRACEFP(fp) \
+  std::fclose(fp); \
+}
+#define BACKTRACE() BACKTRACEFP(stdout)
+#define BACKTRACEFP(fp) { \
+  int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);\
+  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+  for (int i = 0; i < symbols; i++){\
+    std::fprintf(fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+  }\
+}
+#else
+#define BACKTRACE() BACKTRACEFP(stdout)
+#define BACKTRACEFP(fp) { \
+  std::fprintf(fp,"BT %d %lx\n",0,(unsigned long)__builtin_return_address(0)); std::fflush(fp); \
+  std::fprintf(fp,"BT %d %lx\n",1,(unsigned long)__builtin_return_address(1)); std::fflush(fp); \
+  std::fprintf(fp,"BT %d %lx\n",2,(unsigned long)__builtin_return_address(2)); std::fflush(fp); \
+  std::fprintf(fp,"BT %d %lx\n",3,(unsigned long)__builtin_return_address(3)); std::fflush(fp); \
+}
+#endif
}
#endif
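
BACKTRACEFP wraps the glibc backtrace facility from <execinfo.h>: backtrace() fills a buffer with raw return addresses and backtrace_symbols() turns them into printable strings. A minimal standalone sketch of the same calls (link with -rdynamic to get useful symbol names):

#include <execinfo.h>
#include <cstdio>
#include <cstdlib>

void print_backtrace(std::FILE *fp) {
  void *buffer[256];
  int symbols = backtrace(buffer, 256);                 // raw return addresses
  char **strings = backtrace_symbols(buffer, symbols);  // printable frames
  for (int i = 0; i < symbols; i++)
    std::fprintf(fp, "BackTrace Strings: %d %s\n", i, strings[i]);
  free(strings); // backtrace_symbols mallocs the array
}

int main() {
  print_backtrace(stdout);
  return 0;
}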


@@ -114,15 +114,19 @@ namespace Grid {
}
void Communicate(void ) {
+typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
+std::vector<CommsRequest_t> reqs(0);
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
-_grid->SendToRecvFrom(Packets[i].send_buf,
+_grid->SendToRecvFromBegin(reqs,
+Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes);
Packets[i].done = 1;
}
+_grid->SendToRecvFromComplete(reqs);
commtime+=usecond();
}
@@ -648,7 +652,7 @@ PARALLEL_FOR_LOOP
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
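
The Communicate() change above splits communication into a begin/complete pair: every packet's transfer is posted up front into reqs and a single SendToRecvFromComplete waits for all of them, letting the transfers proceed concurrently instead of serialising one blocking exchange per packet. Under MPI the same pattern is nonblocking point-to-point plus a single wait; a sketch under that assumption (helper names here are illustrative, not Grid's):

#include <mpi.h>
#include <vector>

// Post a nonblocking send and receive, accumulating the requests.
void send_recv_begin(std::vector<MPI_Request> &reqs, void *xmit, int dest,
                     void *recv, int from, int bytes) {
  MPI_Request rq;
  MPI_Irecv(recv, bytes, MPI_BYTE, from, 0, MPI_COMM_WORLD, &rq);
  reqs.push_back(rq);
  MPI_Isend(xmit, bytes, MPI_BYTE, dest, 0, MPI_COMM_WORLD, &rq);
  reqs.push_back(rq);
}

// Complete every outstanding transfer in one call.
void send_recv_complete(std::vector<MPI_Request> &reqs) {
  std::vector<MPI_Status> status(reqs.size());
  MPI_Waitall((int)reqs.size(), reqs.data(), status.data());
  reqs.clear();
}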


@@ -56,6 +56,8 @@ class CartesianCommunicator {
typedef int CommsRequest_t;
#endif
+static void Init(int *argc, char ***argv);
// Constructor
CartesianCommunicator(const std::vector<int> &pdimensions_in);


@@ -31,6 +31,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+MPI_Init(argc,argv);
+}
+int Rank(void) {
+int pe;
+MPI_Comm_rank(MPI_COMM_WORLD,&pe);
+return pe;
+}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{


@@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h"
namespace Grid {
+void CartesianCommunicator::Init(int *argc, char ***argv)
+{
+}
+int Rank(void){ return 0; }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
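
With Init added to each communicator implementation, the comms layer chosen at configure time (GRID_COMMS_MPI, GRID_COMMS_SHMEM, or GRID_COMMS_NONE, per the AC_DEFINEs above) owns its own startup, and Grid_init just calls CartesianCommunicator::Init. A sketch of the compile-time dispatch this enables (illustrative, not Grid's code):

// Exactly one GRID_COMMS_* macro is defined by configure; the matching
// init body is the only one compiled in.
#if defined(GRID_COMMS_MPI)
#include <mpi.h>
static void comms_init(int *argc, char ***argv) { MPI_Init(argc, argv); }
#elif defined(GRID_COMMS_SHMEM)
#include <shmem.h>
static void comms_init(int *, char ***) { shmem_init(); }
#else
static void comms_init(int *, char ***) {} // serial build: nothing to start
#endif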


@@ -31,7 +31,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Should error check all MPI calls.
+#define SHMEM_VET(addr)
+#define SHMEM_VET_DEBUG(addr) { \
+if ( ! shmem_addr_accessible(addr,_processor) ) {\
+std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,(unsigned long)addr,__FUNCTION__,#addr); \
+BACKTRACEFILE(); \
+}\
+}
+int Rank(void) {
+return shmem_my_pe();
+}
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+shmem_init();
+}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
@@ -41,8 +54,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_processors = processors;
_processor_coor.resize(_ndimension);
-// shmem_init_thread(SHMEM_THREAD_FUNNELED);
-start_pes(0);
_processor = shmem_my_pe();
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
@@ -50,10 +61,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
if ( _processor == 0 ) {
printf("I'm running SHMEM communications %d \n",_processor);
}
int Size = shmem_n_pes();
assert(Size==_Nprocessors);
}
@@ -85,6 +96,12 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
// Inefficient, but don't want to dynamic alloc
+if ( shmem_addr_accessible(f,_processor) ){
+shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
+return;
+}
for(int i=0;i<N;i++){
source = f[i];
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
@@ -108,6 +125,11 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+if ( shmem_addr_accessible(d,_processor) ){
+shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
+return;
+}
for(int i=0;i<N;i++){
source = d[i];
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
@@ -117,12 +139,13 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;
+assert(std::abs(shift) <_processors[dim]);
-coor[dim] = (coor[dim] + shift + _processors[dim])%_processors[dim];
+coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
-coor[dim] = (coor[dim] - shift + _processors[dim])%_processors[dim];
+coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
}
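
The rewritten ShiftedRanks computes both neighbours from the unmodified _processor_coor, fixing the old version's bug where the second shift started from the already-shifted coordinate. A worked standalone example of the modular arithmetic (illustrative values):

// A periodic ring of 4 processors in one dimension, viewed from
// coordinate 2 with shift = +1.
#include <cassert>
int main() {
  int P = 4, me = 2, shift = 1;
  int up   = (me + shift + P) % P; // neighbour at +shift: coordinate 3
  int down = (me - shift + P) % P; // neighbour at -shift: coordinate 1
  // "+ P" keeps the dividend non-negative: C++ % truncates toward zero, so
  // (0 - 1) % 4 would be -1, while (0 - 1 + 4) % 4 gives the intended 3.
  assert(up == 3 && down == 1);
  return 0;
}
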
@@ -144,6 +167,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
+SHMEM_VET(xmit);
+SHMEM_VET(recv);
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
@@ -171,6 +196,9 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list
int from,
int bytes)
{
+SHMEM_VET(xmit);
+SHMEM_VET(recv);
+// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
shmem_putmem(recv,xmit,bytes,dest);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -185,14 +213,37 @@ void CartesianCommunicator::Barrier(void)
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+static uint32_t word;
+uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
-shmem_broadcast32(data,data,bytes/4,root,0,0,_Nprocessors,psync);
+int words = bytes/4;
+for(int w=0;w<words;w++){
+word = array[w];
+shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
+if ( shmem_my_pe() != root ) {
+array[w] = word;
+}
+shmem_barrier_all();
+}
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+static uint32_t word;
+uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
-shmem_broadcast32(data,data,bytes/4,root,0,0,shmem_n_pes(),psync);
+int words = bytes/4;
+for(int w=0;w<words;w++){
+word = array[w];
+shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
+if ( shmem_my_pe() != root ) {
+array[w] = word;
+}
+shmem_barrier_all();
+}
}
}
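
The rewritten Broadcast pushes one 32-bit word at a time through a static variable because shmem_broadcast32 requires remotely accessible addresses, and the caller's `data` may live on the private heap; a static lives in the symmetric data segment, so every PE can address it. A sketch of the same staging pattern, assuming OpenSHMEM 1.2 (the function name is illustrative, not Grid's):

#include <shmem.h>
#include <cassert>
#include <cstdint>

static long psync[SHMEM_BCAST_SYNC_SIZE];
static uint32_t word; // symmetric staging buffer

void broadcast_words(int root, void *data, int bytes) {
  assert((bytes % 4) == 0);
  static bool psync_ready = false;
  if (!psync_ready) { // pSync must start at SHMEM_SYNC_VALUE, per the spec
    for (int i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) psync[i] = SHMEM_SYNC_VALUE;
    shmem_barrier_all();
    psync_ready = true;
  }
  uint32_t *array = (uint32_t *)data;
  for (int w = 0; w < bytes / 4; w++) {
    word = array[w];
    shmem_broadcast32(&word, &word, 1, root, 0, 0, shmem_n_pes(), psync);
    if (shmem_my_pe() != root) array[w] = word; // root already holds the value
    shmem_barrier_all(); // all PEs finished with `word` before it is reused
  }
}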


@@ -191,8 +191,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
-std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
-std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
+std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
+std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //