/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/communicator/Communicator_base.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H

///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif

#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif

namespace Grid {

class CartesianCommunicator {
public:

  // Communicator should know nothing of the physics grid, only the processor grid.
  int              _Nprocessors;     // How many in all
  std::vector<int> _processors;      // Which dimensions are laid out over processor lanes.
  int              _processor;       // linear processor rank
  std::vector<int> _processor_coor;  // coordinate of this processor in the processor grid
  unsigned long    _ndimension;
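
  // Worked example (illustrative only, numbers hypothetical): a layout of
  //   _processors  = {2,2,2,4}  ->  _ndimension  = 4
  //   _Nprocessors = 2*2*2*4    =   32 ranks in all
  // and each rank stores its linear id in _processor plus its grid coordinate in
  // _processor_coor; with row-major ordering rank 5 would sit at {0,0,1,1}, though
  // the actual ordering is whatever the underlying Cartesian communicator chooses.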

#ifdef GRID_COMMS_MPI
  MPI_Comm communicator;
  typedef MPI_Request CommsRequest_t;
#else
  typedef int CommsRequest_t;
#endif

  static void Init(int *argc, char ***argv);

  // Constructor
  CartesianCommunicator(const std::vector<int> &pdimensions_in);

  // Wraps MPI_Cart routines
  void ShiftedRanks(int dim,int shift,int & source, int & dest);
  int  RankFromProcessorCoor(std::vector<int> &coor);
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
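
  // Illustrative sketch (assuming a constructed communicator `comm`): find the
  // neighbour ranks one step along dimension 0, in the spirit of MPI_Cart_shift.
  //   int src, dst;
  //   comm.ShiftedRanks(0, 1, src, dst);
  //   // typical use: send a face to `dst` and receive the matching face from `src`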

  /////////////////////////////////
  // Grid information queries
  /////////////////////////////////
  int                      IsBoss(void)            { return _processor==0; };
  int                      BossRank(void)          { return 0; };
  int                      ThisRank(void)           { return _processor; };
  const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
  const std::vector<int> & ProcessorGrid(void)     { return _processors; };
  int                      ProcessorCount(void)    { return _Nprocessors; };

  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);

  void GlobalSum(RealD &);
  void GlobalSumVector(RealD *,int N);

  void GlobalSum(uint32_t &);

  void GlobalSum(ComplexF &c)
  {
    GlobalSumVector((float *)&c,2);
  }
  void GlobalSumVector(ComplexF *c,int N)
  {
    GlobalSumVector((float *)c,2*N);
  }

  void GlobalSum(ComplexD &c)
  {
    GlobalSumVector((double *)&c,2);
  }
  void GlobalSumVector(ComplexD *c,int N)
  {
    GlobalSumVector((double *)c,2*N);
  }

  template<class obj> void GlobalSum(obj &o){
    typedef typename obj::scalar_type scalar_type;
    int words = sizeof(obj)/sizeof(scalar_type);
    scalar_type * ptr = (scalar_type *)& o;
    GlobalSumVector(ptr,words);
  }
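
  // Note (illustrative): the template overload works on any object exposing a
  // scalar_type, by viewing it as a flat array of sizeof(obj)/sizeof(scalar_type)
  // scalars, e.g.
  //   comm.GlobalSum(plaquette_accumulator);   // hypothetical compound object
  // This assumes the object is a tightly packed aggregate of scalar_type words
  // with no padding, since padding bytes would be summed as well.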

  ////////////////////////////////////////////////////////////
  // Face exchange: buffer swap in a translationally invariant way
  ////////////////////////////////////////////////////////////
  void SendToRecvFrom(void *xmit,
                      int xmit_to_rank,
                      void *recv,
                      int recv_from_rank,
                      int bytes);
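
  // Illustrative sketch (hypothetical buffers): a blocking face exchange pairing
  // the neighbours returned by ShiftedRanks.
  //   int src, dst;
  //   comm.ShiftedRanks(0, 1, src, dst);
  //   comm.SendToRecvFrom(send_buf, dst, recv_buf, src, face_bytes);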

  void SendRecvPacket(void *xmit,
                      void *recv,
                      int xmit_to_rank,
                      int recv_from_rank,
                      int bytes);

  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                           void *xmit,
                           int xmit_to_rank,
                           void *recv,
                           int recv_from_rank,
                           int bytes);
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
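
  // Illustrative sketch (hypothetical buffers): the non-blocking variant posts the
  // transfers, lets the caller overlap local computation, then waits on the list.
  //   std::vector<CartesianCommunicator::CommsRequest_t> reqs;
  //   comm.SendToRecvFromBegin(reqs, send_buf, dst, recv_buf, src, face_bytes);
  //   // ... overlap interior compute here ...
  //   comm.SendToRecvFromComplete(reqs);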

  ////////////////////////////////////////////////////////////
  // Barrier
  ////////////////////////////////////////////////////////////
  void Barrier(void);

  ////////////////////////////////////////////////////////////
  // Broadcast a raw buffer; the template overload composites larger objects
  ////////////////////////////////////////////////////////////
  void Broadcast(int root,void* data, int bytes);
  template<class obj> void Broadcast(int root,obj &data)
  {
    Broadcast(root,(void *)&data,sizeof(data));
  };
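
  // Illustrative sketch: broadcast a small header struct from the boss rank.
  //   struct FileHeader { uint32_t nd; uint32_t dims[4]; };  // hypothetical POD type
  //   FileHeader h;
  //   comm.Broadcast(comm.BossRank(), h);  // every rank now holds the boss's copy
  // sizeof(data) supplies the byte count, so this is only safe for trivially
  // copyable types that contain no pointers.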

  static void BroadcastWorld(int root,void* data, int bytes);

};
}
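
// Illustrative end-to-end sketch (assumed driver code, not part of this header):
//
//   int main(int argc, char **argv) {
//     Grid::CartesianCommunicator::Init(&argc, &argv);
//     std::vector<int> procs = {1, 1, 2, 2};             // hypothetical 4-rank layout
//     Grid::CartesianCommunicator comm(procs);
//
//     double x = comm.ThisRank();
//     comm.GlobalSum(x);                                 // x becomes the sum of all rank ids
//     if ( comm.IsBoss() ) std::cout << x << std::endl;  // needs <iostream>
//     comm.Barrier();
//     return 0;
//   }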
#endif