Grid/lib/communicator/Communicator_base.h


    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/communicator/Communicator_base.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H

///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPIT
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif

namespace Grid {

class CartesianCommunicator {
  public:    


  ////////////////////////////////////////////
  // Isend/Irecv/Wait, or Sendrecv blocking
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }

  ///////////////////////////////////////////
  // Up to 65536 ranks per node adequate for now
  // 128MB shared memory for comms enought for 48^4 local vol comms
  // Give external control (command line override?) of this
  ///////////////////////////////////////////
  static const int MAXLOG2RANKSPERNODE = 16;            
  static uint64_t  MAX_MPI_SHM_BYTES;
  static int       nCommThreads;
  // use explicit huge pages
  static int       Hugepages;

  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // linear processor coordinate
  unsigned long _ndimension;

#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  static MPI_Comm communicator_world;

  MPI_Comm              communicator;
  std::vector<MPI_Comm> communicator_halo;

  typedef MPI_Request CommsRequest_t;

#else 
  typedef int CommsRequest_t;
#endif


  ////////////////////////////////////////////////////////////////////
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
  // Longer term; drop this in favour of a master / slave model with 
  // cartesian communicator on a subset of ranks, slave ranks controlled
  // by group leader with data xfer via shared memory
  ////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3

  static int ShmRank;
  static int ShmSize;
  static int GroupRank;
  static int GroupSize;
  static int WorldRank;
  static int WorldSize;

  std::vector<int>  WorldDims;
  std::vector<int>  GroupDims;
  std::vector<int>  ShmDims;
  
  std::vector<int> GroupCoor;
  std::vector<int> ShmCoor;
  std::vector<int> WorldCoor;

  static std::vector<int> GroupRanks; 
  static std::vector<int> MyGroup;
  static int ShmSetup;
  static MPI_Win ShmWindow; 
  static MPI_Comm ShmComm;
  
  std::vector<int>  LexicographicToWorldRank;
  
  static std::vector<void *> ShmCommBufs;

#else 
  static void ShmInitGeneric(void);
  static commVector<uint8_t> ShmBufStorageVector;
#endif 

  /////////////////////////////////
  // Grid information and queries
  // Implemented in Communicator_base.C
  /////////////////////////////////
  static void * ShmCommBuf;

  
  size_t heap_top;
  size_t heap_bytes;

  void *ShmBufferSelf(void);
  void *ShmBuffer(int rank);
  void *ShmBufferTranslate(int rank,void * local_p);
  void *ShmBufferMalloc(size_t bytes);
  void ShmBufferFreeAll(void) ;
  
  ////////////////////////////////////////////////
  // Must call in Grid startup
  ////////////////////////////////////////////////
  static void Init(int *argc, char ***argv);

  ////////////////////////////////////////////////
  // Constructors to sub-divide a parent communicator
  // and default to comm world
  ////////////////////////////////////////////////
  CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent);
  CartesianCommunicator(const std::vector<int> &pdimensions_in);

 private:
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) 
  ////////////////////////////////////////////////
  // Private initialise from an MPI communicator
  // Can use after an MPI_Comm_split, but hidden from user so private
  ////////////////////////////////////////////////
  void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
#endif
 public:
  
  ////////////////////////////////////////////////////////////////////////////////////////
  // Wraps MPI_Cart routines, or implements equivalent on other impls
  ////////////////////////////////////////////////////////////////////////////////////////
  void ShiftedRanks(int dim,int shift,int & source, int & dest);
  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
  
  int                      Dimensions(void)        ;
  int                      IsBoss(void)            ;
  int                      BossRank(void)          ;
  int                      ThisRank(void)          ;
  const std::vector<int> & ThisProcessorCoor(void) ;
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  int                      NodeCount(void)    ;
  int                      RankCount(void)    ;

  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
  static int  RankWorld(void) ;
  static void BroadcastWorld(int root,void* data, int bytes);
  
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
  void GlobalSumVector(ComplexD *c,int N);
  void GlobalXOR(uint32_t &);
  void GlobalXOR(uint64_t &);
  
  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
    scalar_type * ptr = (scalar_type *)& o;
    GlobalSumVector(ptr,words);
  }
  
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
  void SendToRecvFrom(void *xmit,
		      int xmit_to_rank,
		      void *recv,
		      int recv_from_rank,
		      int bytes);
  
  void SendRecvPacket(void *xmit,
		      void *recv,
		      int xmit_to_rank,
		      int recv_from_rank,
		      int bytes);
  
  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
			   void *xmit,
			   int xmit_to_rank,
			   void *recv,
			   int recv_from_rank,
			   int bytes);
  
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

  double StencilSendToRecvFrom(void *xmit,
			       int xmit_to_rank,
			       void *recv,
			       int recv_from_rank,
			       int bytes,int dir);

  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
				    void *xmit,
				    int xmit_to_rank,
				    void *recv,
				    int recv_from_rank,
				    int bytes,int dir);
  
  
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);

  ////////////////////////////////////////////////////////////
  // Barrier
  ////////////////////////////////////////////////////////////
  void Barrier(void);
  
  ////////////////////////////////////////////////////////////
  // Broadcast a buffer and composite larger
  ////////////////////////////////////////////////////////////
  void Broadcast(int root,void* data, int bytes);

  ////////////////////////////////////////////////////////////
  // All2All down one dimension
  ////////////////////////////////////////////////////////////
  template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
    assert(dim>=0);
    assert(dim<_ndimension);
    int numnode = _processors[dim];
    //    std::cerr << " AllToAll in.size()  "<<in.size()<<std::endl;
    //    std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
    assert(in.size()==out.size());
    size_t bytes=(in.size()*sizeof(T))/numnode;
    assert((bytes*numnode) == in.size()*sizeof(T));
    AllToAll(dim,(void *)&in[0],(void *)&out[0],bytes);
  }
  void AllToAll(int dim  ,void *in,void *out,int bytes);
  void AllToAll(void  *in,void *out,int bytes);
  
  template<class obj> void Broadcast(int root,obj &data)
    {
      Broadcast(root,(void *)&data,sizeof(data));
    };

}; 
}

#endif
Comms improvements 2016-11-01 11:35:43 +00:00
Global edit adding copyright and license info to every source file. 2016-01-02 14:51:32 +00:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./lib/communicator/Communicator_base.h`

			`Copyright (C) 2015`

			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
Moving some things around for pretty 2015-05-11 19:09:49 +01:00			`#ifndef GRID_COMMUNICATOR_BASE_H`
			`#define GRID_COMMUNICATOR_BASE_H`

			`///////////////////////////////////`
			`// Processor layout information`
			`///////////////////////////////////`
			`#ifdef GRID_COMMS_MPI`
			`#include <mpi.h>`
			`#endif`
MPI3 and shared memory support 2016-10-20 16:58:01 +01:00			`#ifdef GRID_COMMS_MPI3`
			`#include <mpi.h>`
			`#endif`
Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00			`#ifdef GRID_COMMS_MPIT`
Comms improvements 2016-11-01 11:35:43 +00:00			`#include <mpi.h>`
			`#endif`
Updates for shmem 2016-02-11 00:50:32 +00:00			`#ifdef GRID_COMMS_SHMEM`
			`#include <mpp/shmem.h>`
			`#endif`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
Moving some things around for pretty 2015-05-11 19:09:49 +01:00			`namespace Grid {`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
Moving some things around for pretty 2015-05-11 19:09:49 +01:00			`class CartesianCommunicator {`
			`public:`

Trying to pass TeamCity and Travis 2017-08-20 01:10:50 +01:00
			`////////////////////////////////////////////`
			`// Isend/Irecv/Wait, or Sendrecv blocking`
			`////////////////////////////////////////////`
			`enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };`
			`static CommunicatorPolicy_t CommunicatorPolicy;`
			`static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }`

			`///////////////////////////////////////////`
			`// Up to 65536 ranks per node adequate for now`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`// 128MB shared memory for comms enought for 48^4 local vol comms`
			`// Give external control (command line override?) of this`
Trying to pass TeamCity and Travis 2017-08-20 01:10:50 +01:00			`///////////////////////////////////////////`
Threaded MPI calls patches 2017-07-29 18:06:53 +01:00			`static const int MAXLOG2RANKSPERNODE = 16;`
			`static uint64_t MAX_MPI_SHM_BYTES;`
			`static int nCommThreads;`
Trying to pass TeamCity and Travis 2017-08-20 01:10:50 +01:00			`// use explicit huge pages`
			`static int Hugepages;`
MPI3 and shared memory support 2016-10-20 16:58:01 +01:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`// Communicator should know nothing of the physics grid, only processor grid.`
			`int _Nprocessors; // How many in all`
			`std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.`
			`int _processor; // linear processor rank`
			`std::vector<int> _processor_coor; // linear processor coordinate`
			`unsigned long _ndimension;`

Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00			`#if defined (GRID_COMMS_MPI) \|\| defined (GRID_COMMS_MPI3) \|\| defined (GRID_COMMS_MPIT)`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`static MPI_Comm communicator_world;`
Moving multicommunicator into mpi3 also for threading 2017-08-20 02:39:10 +01:00
			`MPI_Comm communicator;`
			`std::vector<MPI_Comm> communicator_halo;`

Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`typedef MPI_Request CommsRequest_t;`
Splitting communicators first cut 2017-06-22 08:14:34 +01:00
Moving some things around for pretty 2015-05-11 19:09:49 +01:00			`#else`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`typedef int CommsRequest_t;`
Moving some things around for pretty 2015-05-11 19:09:49 +01:00			`#endif`

Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`////////////////////////////////////////////////////////////////////`
			`// Helper functionality for SHM Windows common to all other impls`
			`////////////////////////////////////////////////////////////////////`
			`// Longer term; drop this in favour of a master / slave model with`
			`// cartesian communicator on a subset of ranks, slave ranks controlled`
			`// by group leader with data xfer via shared memory`
			`////////////////////////////////////////////////////////////////////`
Comms improvements 2016-11-01 11:35:43 +00:00			`#ifdef GRID_COMMS_MPI3`

			`static int ShmRank;`
			`static int ShmSize;`
			`static int GroupRank;`
			`static int GroupSize;`
			`static int WorldRank;`
			`static int WorldSize;`

Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`std::vector<int> WorldDims;`
			`std::vector<int> GroupDims;`
			`std::vector<int> ShmDims;`

			`std::vector<int> GroupCoor;`
			`std::vector<int> ShmCoor;`
			`std::vector<int> WorldCoor;`
Comms improvements 2016-11-01 11:35:43 +00:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`static std::vector<int> GroupRanks;`
			`static std::vector<int> MyGroup;`
			`static int ShmSetup;`
			`static MPI_Win ShmWindow;`
			`static MPI_Comm ShmComm;`

			`std::vector<int> LexicographicToWorldRank;`

			`static std::vector<void *> ShmCommBufs;`
Comms improvements 2016-11-01 11:35:43 +00:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`#else`
			`static void ShmInitGeneric(void);`
			`static commVector<uint8_t> ShmBufStorageVector;`
			`#endif`
Comms improvements 2016-11-01 11:35:43 +00:00
			`/////////////////////////////////`
			`// Grid information and queries`
			`// Implemented in Communicator_base.C`
			`/////////////////////////////////`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`static void * ShmCommBuf;`
Communicator Policy, NodeCount distinct from Rank count 2017-02-07 06:22:53 +00:00
Threaded MPI calls patches 2017-07-29 18:06:53 +01:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`size_t heap_top;`
			`size_t heap_bytes;`
Comms improvements 2016-11-01 11:35:43 +00:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`void *ShmBufferSelf(void);`
			`void *ShmBuffer(int rank);`
			`void ShmBufferTranslate(int rank,void local_p);`
			`void *ShmBufferMalloc(size_t bytes);`
Travis fail fix attempt 2016-10-25 01:45:53 +01:00			`void ShmBufferFreeAll(void) ;`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
			`////////////////////////////////////////////////`
			`// Must call in Grid startup`
			`////////////////////////////////////////////////`
			`static void Init(int argc, char **argv);`
Splitting communicators first cut 2017-06-22 08:14:34 +01:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`////////////////////////////////////////////////`
Splitting communicators first cut 2017-06-22 08:14:34 +01:00			`// Constructors to sub-divide a parent communicator`
			`// and default to comm world`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`////////////////////////////////////////////////`
Splitting communicators first cut 2017-06-22 08:14:34 +01:00			`CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent);`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`CartesianCommunicator(const std::vector<int> &pdimensions_in);`
Splitting communicators first cut 2017-06-22 08:14:34 +01:00
			`private:`
MPIT works with split grid now 2017-10-02 23:14:56 +01:00			`#if defined (GRID_COMMS_MPI) \|\| defined (GRID_COMMS_MPIT)`
Splitting communicators first cut 2017-06-22 08:14:34 +01:00			`////////////////////////////////////////////////`
			`// Private initialise from an MPI communicator`
			`// Can use after an MPI_Comm_split, but hidden from user so private`
			`////////////////////////////////////////////////`
			`void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);`
			`#endif`
			`public:`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
			`////////////////////////////////////////////////////////////////////////////////////////`
			`// Wraps MPI_Cart routines, or implements equivalent on other impls`
			`////////////////////////////////////////////////////////////////////////////////////////`
			`void ShiftedRanks(int dim,int shift,int & source, int & dest);`
			`int RankFromProcessorCoor(std::vector<int> &coor);`
			`void ProcessorCoorFromRank(int rank,std::vector<int> &coor);`

RankCount; need to clean up ambigious ProcessCount 2017-05-30 23:39:16 +01:00			`int Dimensions(void) ;`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`int IsBoss(void) ;`
			`int BossRank(void) ;`
			`int ThisRank(void) ;`
			`const std::vector<int> & ThisProcessorCoor(void) ;`
			`const std::vector<int> & ProcessorGrid(void) ;`
			`int ProcessorCount(void) ;`
Communicator Policy, NodeCount distinct from Rank count 2017-02-07 06:22:53 +00:00			`int NodeCount(void) ;`
RankCount; need to clean up ambigious ProcessCount 2017-05-30 23:39:16 +01:00			`int RankCount(void) ;`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
			`////////////////////////////////////////////////////////////////////////////////`
			`// very VERY rarely (Log, serial RNG) we need world without a grid`
			`////////////////////////////////////////////////////////////////////////////////`
			`static int RankWorld(void) ;`
			`static void BroadcastWorld(int root,void* data, int bytes);`

			`////////////////////////////////////////////////////////////`
			`// Reduction`
			`////////////////////////////////////////////////////////////`
			`void GlobalSum(RealF &);`
			`void GlobalSumVector(RealF *,int N);`
			`void GlobalSum(RealD &);`
			`void GlobalSumVector(RealD *,int N);`
			`void GlobalSum(uint32_t &);`
			`void GlobalSum(uint64_t &);`
			`void GlobalSum(ComplexF &c);`
			`void GlobalSumVector(ComplexF *c,int N);`
			`void GlobalSum(ComplexD &c);`
			`void GlobalSumVector(ComplexD *c,int N);`
I/O improvements 2017-06-11 23:14:10 +01:00			`void GlobalXOR(uint32_t &);`
			`void GlobalXOR(uint64_t &);`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
			`template<class obj> void GlobalSum(obj &o){`
			`typedef typename obj::scalar_type scalar_type;`
			`int words = sizeof(obj)/sizeof(scalar_type);`
			`scalar_type * ptr = (scalar_type *)& o;`
			`GlobalSumVector(ptr,words);`
			`}`

			`////////////////////////////////////////////////////////////`
			`// Face exchange, buffer swap in translational invariant way`
			`////////////////////////////////////////////////////////////`
			`void SendToRecvFrom(void *xmit,`
			`int xmit_to_rank,`
			`void *recv,`
			`int recv_from_rank,`
			`int bytes);`

			`void SendRecvPacket(void *xmit,`
			`void *recv,`
			`int xmit_to_rank,`
			`int recv_from_rank,`
			`int bytes);`

			`void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,`
			`void *xmit,`
			`int xmit_to_rank,`
			`void *recv,`
			`int recv_from_rank,`
			`int bytes);`

			`void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);`

Threaded MPI calls patches 2017-07-29 18:06:53 +01:00			`double StencilSendToRecvFrom(void *xmit,`
			`int xmit_to_rank,`
			`void *recv,`
			`int recv_from_rank,`
			`int bytes,int dir);`

Count bytes off node only 2017-02-20 22:47:40 +00:00			`double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,`
Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00			`void *xmit,`
			`int xmit_to_rank,`
			`void *recv,`
			`int recv_from_rank,`
			`int bytes,int dir);`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00

Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00			`void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`void StencilBarrier(void);`
Moving some things around for pretty 2015-05-11 19:09:49 +01:00
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00			`////////////////////////////////////////////////////////////`
			`// Barrier`
			`////////////////////////////////////////////////////////////`
			`void Barrier(void);`

			`////////////////////////////////////////////////////////////`
			`// Broadcast a buffer and composite larger`
			`////////////////////////////////////////////////////////////`
			`void Broadcast(int root,void* data, int bytes);`
Comms splitting improvements 2017-10-09 23:16:51 +01:00
			`////////////////////////////////////////////////////////////`
			`// All2All down one dimension`
			`////////////////////////////////////////////////////////////`
			`template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){`
			`assert(dim>=0);`
			`assert(dim<_ndimension);`
			`int numnode = _processors[dim];`
			`// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;`
			`// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;`
			`assert(in.size()==out.size());`
			`size_t bytes=(in.size()*sizeof(T))/numnode;`
			`assert((bytesnumnode) == in.size()sizeof(T));`
			`AllToAll(dim,(void )&in[0],(void )&out[0],bytes);`
			`}`
			`void AllToAll(int dim ,void in,void out,int bytes);`
			`void AllToAll(void in,void out,int bytes);`
Update to use shared memory to contain the stencil comms buffers Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions 2016-10-24 17:30:43 +01:00
			`template<class obj> void Broadcast(int root,obj &data)`
Moving some things around for pretty 2015-05-11 19:09:49 +01:00			`{`
			`Broadcast(root,(void *)&data,sizeof(data));`
			`};`

			`};`
			`}`

			`#endif`