Mirror of https://github.com/paboyle/Grid.git

Merge commit '20a091c3eddfdb67a82ece6413740a93650a2f98' into feature/feynman-rules
@@ -244,6 +244,9 @@ case ${ac_COMMS} in
    mpi)
      AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
    ;;
    mpi3)
      AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
    ;;
    shmem)
      AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
    ;;
@@ -253,6 +256,7 @@ case ${ac_COMMS} in
esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])

############### RNG selection
@@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h>
#endif

#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void  shmem_free(void *);
}
#endif

namespace Grid {

////////////////////////////////////////////////////////////////////
@@ -65,28 +57,85 @@ public:
  typedef _Tp        value_type;

  template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };

  alignedAllocator() throw() { }

  alignedAllocator(const alignedAllocator&) throw() { }

  template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }

  ~alignedAllocator() throw() { }

  pointer       address(reference __x)       const { return &__x; }
  //  const_pointer address(const_reference __x) const { return &__x; }

  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }

  pointer allocate(size_type __n, const void* _p= 0)
  {
#ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif

    _Tp tmp;
#ifdef GRID_NUMA
#pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
#endif
    return ptr;
  }

  void deallocate(pointer __p, size_type) {
#ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p);
#else
    free((void *)__p);
#endif
  }
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };
  void destroy(pointer __p) { };
};
template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }

//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM

    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);

extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void  shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif

template<typename _Tp>
class commAllocator {
public:
  typedef std::size_t     size_type;
  typedef std::ptrdiff_t  difference_type;
  typedef _Tp*       pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp&       reference;
  typedef const _Tp& const_reference;
  typedef _Tp        value_type;

  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
  commAllocator() throw() { }
  commAllocator(const commAllocator&) throw() { }
  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
  ~commAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }

#ifdef GRID_COMMS_SHMEM
  pointer allocate(size_type __n, const void* _p= 0)
  {
#ifdef CRAY
    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
#else
    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
    static void * bcast;
    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
@@ -99,51 +148,53 @@ public:
      BACKTRACEFILE();
      exit(0);
    }

    assert( bcast == (void *) ptr);

#endif
#else
    return ptr;
  }
  void deallocate(pointer __p, size_type) {
    shmem_free((void *)__p);
  }
#elif defined(GRID_COMMS_MPI3)
  pointer allocate(size_type __n, const void* _p= 0)
  {
#error "implement MPI3 windowed allocate"

  }
  void deallocate(pointer __p, size_type) {
#error "implement MPI3 windowed allocate"
  }
#else
  pointer allocate(size_type __n, const void* _p= 0)
  {
#ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif

#endif
    _Tp tmp;
#ifdef GRID_NUMA
#pragma omp parallel for schedule(static)
  for(int i=0;i<__n;i++){
    ptr[i]=tmp;
  }
#endif
    return ptr;
  }

  void deallocate(pointer __p, size_type) {
#ifdef GRID_COMMS_SHMEM
    shmem_free((void *)__p);
#else
#ifdef HAVE_MM_MALLOC_H
    _mm_free((void *)__p);
#else
    free((void *)__p);
#endif
#endif
  }
#endif
  void construct(pointer __p, const _Tp& __val) { };
  void construct(pointer __p) { };

  void destroy(pointer __p) { };
};
template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }

template<typename _Tp>  inline bool
operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }

template<typename _Tp>  inline bool
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;

}; // namespace Grid
#endif
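The commAllocator above deliberately stops at #error placeholders for the GRID_COMMS_MPI3 branch. As a hedged illustration only, not the patch's implementation, an MPI-3 shared-memory window allocation could be sketched along the following lines; the helper names and the per-node communicator parameter are assumptions introduced here.

// Hypothetical sketch of one way the "#error implement MPI3 windowed allocate"
// placeholder could eventually be filled in with an MPI-3 shared-memory window.
// Names (mpi3_window_alloc/free, node_comm) are illustrative, not from the patch.
#include <mpi.h>
#include <cstddef>

template <typename T>
T *mpi3_window_alloc(std::size_t n, MPI_Comm node_comm, MPI_Win &win) {
  void *base = nullptr;
  // Every rank on the node allocates its own segment of the shared window;
  // peers could later map each other's segments with MPI_Win_shared_query.
  MPI_Win_allocate_shared((MPI_Aint)(n * sizeof(T)), sizeof(T),
                          MPI_INFO_NULL, node_comm, &base, &win);
  return static_cast<T *>(base);
}

template <typename T>
void mpi3_window_free(T *, MPI_Win &win) {
  MPI_Win_free(&win); // releases the shared segment allocated above
}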
@@ -38,6 +38,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif

#ifdef GRID_COMMS_MPI3
#include <Grid/cshift/Cshift_mpi.h>
#endif

#ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif

@@ -3,6 +3,10 @@ if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
endif

if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_mpi3.cc
endif

if BUILD_COMMS_SHMEM
  extra_sources+=communicator/Communicator_shmem.cc
endif
@@ -71,7 +71,7 @@
 namespace Grid {

template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
{
  table.resize(0);
  int rd = rhs._grid->_rdimensions[dimension];
@@ -109,7 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
}

template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,commVector<cobj> &buffer,
			   compressor &compress, int off,int so)
{
PARALLEL_FOR_LOOP
@@ -119,7 +119,7 @@ PARALLEL_FOR_LOOP
}

template<class vobj,class cobj,class compressor> void
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
			     double &t_table ,double & t_data )
{
  std::vector<std::pair<int,int> > table;
@@ -347,10 +347,10 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
       }

       // Comms buffers
       std::vector<Vector<scalar_object> > u_simd_send_buf;
       std::vector<Vector<scalar_object> > u_simd_recv_buf;
       Vector<cobj>          u_send_buf;
       Vector<cobj>          comm_buf;
       std::vector<commVector<scalar_object> > u_simd_send_buf;
       std::vector<commVector<scalar_object> > u_simd_recv_buf;
       commVector<cobj>          u_send_buf;
       commVector<cobj>          comm_buf;
       int u_comm_offset;
       int _unified_buffer_size;
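The Stencil.h hunks above swap every std::vector<cobj,alignedAllocator<cobj> > comms buffer for commVector<cobj>. With the template typedefs added in the allocator header earlier in this commit, that is purely a change of allocator: the element layout is unchanged, but the storage now comes from commAllocator, which SHMEM/MPI3 builds can point at a symmetric heap or shm region. A minimal sketch of the intent, assuming the Grid headers are on the include path:

// Sketch only; assumes Grid's headers are available. Shows the buffer-type change
// made throughout the stencil and cshift code in this commit.
#include "Grid.h"

void halo_buffer_sketch(void) {
  Grid::Vector<double>     work_buf(1024); // ordinary 128-byte aligned storage
  Grid::commVector<double> send_buf(1024); // storage commAllocator may place in a
                                           // shm region / symmetric heap
  // Gather_plane_simple(...) and the stencil now take commVector<> buffers, so the
  // pointer handed to MPI/SHMEM always originates from commAllocator.
  send_buf[0] = work_buf[0];
}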
@@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
@@ -52,6 +55,29 @@ class CartesianCommunicator {
#ifdef GRID_COMMS_MPI
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;
#elif  GRID_COMMS_MPI3
    MPI_Comm communicator;
    typedef MPI_Request CommsRequest_t;

    const int MAXLOG2RANKSPERNODE = 16; // 65536 ranks per node adequate for now

    std::vector<int>  WorldDims;
    std::vector<int>  GroupDims;
    std::vector<int>  ShmDims;

    std::vector<int> GroupCoor;
    std::vector<int> ShmCoor;
    std::vector<int> WorldCoor;

    int GroupRank;
    int ShmRank;
    int WorldRank;

    int GroupSize;
    int ShmSize;
    int WorldSize;

    std::vector<int>  LexicographicToWorldRank;
#else
    typedef int CommsRequest_t;
#endif
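The new MPI3 members above describe a two-level decomposition: ShmDims ranks within a node times GroupDims nodes reproduces WorldDims. The constructor in Communicator_mpi3.cc (below) spreads log2(ranks-per-node) factors of two across the lattice dimensions; a standalone sketch of that factorisation with made-up example numbers (a 4^4 processor grid and 8 ranks per node):

// Standalone sketch of the ShmDims factorisation used later in Communicator_mpi3.cc.
// The example numbers are illustrative only.
#include <iostream>
#include <vector>

int main() {
  std::vector<int> WorldDims = {4, 4, 4, 4}; // MPI ranks per dimension
  int ShmSize = 8;                           // ranks sharing a node (power of two)
  int log2size = 0;
  while ((1 << log2size) != ShmSize) log2size++;

  int nd = (int)WorldDims.size();
  std::vector<int> ShmDims(nd, 1), GroupDims(nd);
  int dim = 0;
  for (int l2 = 0; l2 < log2size; l2++) {
    while (WorldDims[dim] / ShmDims[dim] <= 1) dim = (dim + 1) % nd;
    ShmDims[dim] *= 2;                       // give this dimension another factor of 2
    dim = (dim + 1) % nd;
  }
  for (int d = 0; d < nd; d++) GroupDims[d] = WorldDims[d] / ShmDims[d];

  // Expected output: ShmDims = 2 2 2 1 | GroupDims = 2 2 2 4
  for (int d = 0; d < nd; d++) std::cout << ShmDims[d] << " ";
  std::cout << "| ";
  for (int d = 0; d < nd; d++) std::cout << GroupDims[d] << " ";
  std::cout << std::endl;
  return 0;
}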
 
lib/communicator/Communicator_mpi3.cc (358 lines, new file)
@@ -0,0 +1,358 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include "Grid.h"
#include <mpi.h>

namespace Grid {

// Global used by Init and nowhere else. How to hide?
int Rank(void) {
  int pe;
  MPI_Comm_rank(MPI_COMM_WORLD,&pe);
  return pe;
}
  // Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
}
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Want to implement some magic ... Group sub-cubes into those on same node
  //
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////

void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  std::vector<int> coor = _processor_coor;

  assert(std::abs(shift) <_processors[dim]);

  coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,source,_processors);
  source = LexicographicToWorldRank[source];

  coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
  Lexicographic::IndexFromCoor(coor,dest,_processors);
  dest = LexicographicToWorldRank[dest];
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  Lexicographic::IndexFromCoor(coor,rank,_processors);
  rank = LexicographicToWorldRank[rank];
  return rank;
}
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  Lexicographic::CoorFromIndex(coor,rank,_processors);
  rank = LexicographicToWorldRank[rank];
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::cout << "Creating "<< _ndimension << " dim communicator "<<std::endl;
  for(int d =0;d<_ndimension;d++){
    std::cout << processors[d]<<" ";
  };
  std::cout << std::endl;

  WorldDims = processors;

  communicator = MPI_COMM_WORLD;
  MPI_Comm shmcomm;
  MPI_Comm_split_type(communicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&shmcomm);
  MPI_Comm_rank(communicator,&WorldRank);
  MPI_Comm_size(communicator,&WorldSize);
  MPI_Comm_rank(shmcomm     ,&ShmRank);
  MPI_Comm_size(shmcomm     ,&ShmSize);
  GroupSize = WorldSize/ShmSize;

  std::cout<< "Ranks per node "<< ShmSize << std::endl;
  std::cout<< "Nodes          "<< GroupSize << std::endl;
  std::cout<< "Ranks          "<< WorldSize << std::endl;

  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
    if ( (0x1<<i) == ShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);

  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int dim = 0;

  ShmDims.resize(_ndimension,1);
  GroupDims.resize(_ndimension);

  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);
  for(int l2=0;l2<log2size;l2++){
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
    ShmDims[dim]*=2;
    dim=(dim+1)%_ndimension;
  }

  std::cout << "Shm group dims "<<std::endl;
  for(int d =0;d<_ndimension;d++){
    std::cout << ShmDims[d]<<" ";
  };
  std::cout << std::endl;

  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<_ndimension;d++){
    GroupDims[d] = WorldDims[d]/ShmDims[d];
  }
  std::cout << "Group dims "<<std::endl;
  for(int d =0;d<_ndimension;d++){
    std::cout << GroupDims[d]<<" ";
  };
  std::cout << std::endl;

  MPI_Group WorldGroup, ShmGroup;
  MPI_Comm_group (communicator, &WorldGroup);
  MPI_Comm_group (shmcomm, &ShmGroup);

  std::vector<int> world_ranks(WorldSize);
  std::vector<int> group_ranks(WorldSize);
  std::vector<int> mygroup(GroupSize);
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;

  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &group_ranks[0]);

  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  for(int i=0;i<_ndimension;i++){
    std::cout << " p " << _processors[i]<<std::endl;
    _Nprocessors*=_processors[i];
  }
  std::cout << " World " <<WorldSize <<" Nproc "<<_Nprocessors<<std::endl;
  assert(WorldSize==_Nprocessors);

  ///////////////////////////////////////////////////////////////////
  // Identify who is in my group and nominate the leader
  ///////////////////////////////////////////////////////////////////
  int g=0;
  for(int rank=0;rank<WorldSize;rank++){
    if(group_ranks[rank]!=MPI_UNDEFINED){
      mygroup[g] = rank;
    }
  }

  std::sort(mygroup.begin(),mygroup.end(),std::greater<int>());
  int myleader = mygroup[0];

  std::vector<int> leaders_1hot(WorldSize,0);
  std::vector<int> leaders_group(GroupSize,0);
  leaders_1hot [ myleader ] = 1;

  ///////////////////////////////////////////////////////////////////
  // global sum leaders over comm world
  ///////////////////////////////////////////////////////////////////
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);

  ///////////////////////////////////////////////////////////////////
  // find the group leaders world rank
  ///////////////////////////////////////////////////////////////////
  int group=0;
  for(int l=0;l<WorldSize;l++){
    if(leaders_1hot[l]){
      leaders_group[group++] = l;
    }
  }

  ///////////////////////////////////////////////////////////////////
  // Identify the rank of the group in which I (and my leader) live
  ///////////////////////////////////////////////////////////////////
  GroupRank=-1;
  for(int g=0;g<GroupSize;g++){
    if (myleader == leaders_group[g]){
      GroupRank=g;
    }
  }
  assert(GroupRank!=-1);

  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  //
  ////////////////////////////////////////////////////////////////
  LexicographicToWorldRank.resize(WorldSize,0);
  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
  for(int d=0;d<_ndimension;d++){
    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
  }
  _processor_coor = WorldCoor;

  int lexico;
  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
  LexicographicToWorldRank[lexico]=WorldRank;
  _processor = lexico;

  ///////////////////////////////////////////////////////////////////
  // global sum Lexico to World mapping
  ///////////////////////////////////////////////////////////////////
  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);

};

void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
					   int dest,
					   void *recv,
					   int from,
					   int bytes)
{
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
					   void *recv,
					   int sender,
					   int receiver,
					   int bytes)
{
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) {
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						void *xmit,
						int dest,
						void *recv,
						int from,
						int bytes)
{
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);

  assert(ierr==0);

  list.push_back(xrq);
  list.push_back(rrq);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);

  assert(ierr==0);
}

void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,
		     bytes,
		     MPI_BYTE,
		     root,
		     communicator);
  assert(ierr==0);
}

void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,
		      bytes,
		      MPI_BYTE,
		      root,
		      MPI_COMM_WORLD);
  assert(ierr==0);
}

}
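The constructor above leans on Lexicographic::IndexFromCoor / CoorFromIndex to build LexicographicToWorldRank. A self-contained sketch of that index/coordinate convention, assuming (as the surrounding code appears to) that the first coordinate runs fastest; this only mimics the helpers and is not Grid's implementation:

// Sketch of a lexicographic index <-> coordinate round trip, coordinate 0 fastest.
#include <cassert>
#include <vector>

static int IndexFromCoor(const std::vector<int> &coor, const std::vector<int> &dims) {
  int index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--) index = index * dims[d] + coor[d];
  return index; // c0 + d0*(c1 + d1*(c2 + ...))
}

static void CoorFromIndex(std::vector<int> &coor, int index, const std::vector<int> &dims) {
  coor.resize(dims.size());
  for (int d = 0; d < (int)dims.size(); d++) {
    coor[d] = index % dims[d]; // peel off the fastest-running coordinate first
    index /= dims[d];
  }
}

int main() {
  std::vector<int> dims = {2, 2, 2, 4};   // e.g. GroupDims from the earlier example
  std::vector<int> coor = {1, 0, 1, 3};
  int idx = IndexFromCoor(coor, dims);    // 1 + 2*(0 + 2*(1 + 2*3)) = 29
  std::vector<int> back;
  CoorFromIndex(back, idx, dims);
  assert(back == coor);                   // round trip is the identity
  return idx == 29 ? 0 : 1;
}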
@@ -45,7 +45,7 @@ public:
// Gather for when there is no need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> void
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
{
  int rd = rhs._grid->_rdimensions[dimension];

@@ -114,6 +114,7 @@ PARALLEL_NESTED_LOOP2
	int o      =   n*n1;
	int offset = b+n*n2;
	cobj temp =compress(rhs._odata[so+o+b]);

	extract<cobj>(temp,pointers,offset);

      }
@@ -121,6 +122,7 @@ PARALLEL_NESTED_LOOP2
  } else {

    assert(0); //Fixme think this is buggy

    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
	int o=n*rhs._grid->_slice_stride[dimension];
@@ -139,7 +141,7 @@ PARALLEL_NESTED_LOOP2
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
  SimpleCompressor<vobj> dontcompress;
  Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@@ -157,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
  int rd = rhs._grid->_rdimensions[dimension];

@@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);

  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
  commVector<vobj> send_buf(buffer_size);
  commVector<vobj> recv_buf(buffer_size);

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@@ -191,8 +191,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  int words = sizeof(vobj)/sizeof(vector_type);

  std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
  std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );

  int bytes = buffer_size*sizeof(scalar_object);

@@ -65,9 +65,6 @@ public:

class LatticeExpressionBase {};

template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??

template <typename Op, typename T1>
class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 public:
@@ -45,7 +45,7 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
    commVector<SiteHalfSpinor> &buf, int sF,
    int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
@@ -222,7 +222,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite(
    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
    commVector<SiteHalfSpinor> &buf, int sF,
    int sU, const FermionField &in, FermionField &out) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
@@ -398,7 +398,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
template <class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(
    StencilImpl &st, DoubledGaugeField &U,
    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
    commVector<SiteHalfSpinor> &buf, int sF,
    int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;

@@ -58,7 +58,7 @@ namespace Grid {
      typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
	DiracOptDhopSite(
			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
			 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
			 commVector<SiteHalfSpinor> &buf,
			 int sF, int sU, int Ls, int Ns, const FermionField &in,
			 FermionField &out) {
#ifdef AVX512
@@ -89,7 +89,7 @@ namespace Grid {
	  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
	  DiracOptDhopSite(
			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
			   std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
			   commVector<SiteHalfSpinor> &buf,
			   int sF, int sU, int Ls, int Ns, const FermionField &in,
			   FermionField &out) {
	  for (int site = 0; site < Ns; site++) {
@@ -107,7 +107,7 @@ namespace Grid {
				  void>::type
	  DiracOptDhopSiteDag(
			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
			      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
			      commVector<SiteHalfSpinor> &buf,
			      int sF, int sU, int Ls, int Ns, const FermionField &in,
			      FermionField &out) {
#ifdef AVX512
@@ -139,7 +139,7 @@ namespace Grid {
				      void>::type
				      DiracOptDhopSiteDag(
							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
							  std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
							  commVector<SiteHalfSpinor> &buf,
							  int sF, int sU, int Ls, int Ns, const FermionField &in,
							  FermionField &out) {
					for (int site = 0; site < Ns; site++) {
@@ -154,7 +154,7 @@ namespace Grid {

				    void DiracOptDhopDir(
							 StencilImpl &st, DoubledGaugeField &U,
							 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
							 commVector<SiteHalfSpinor> &buf,
							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
							 int gamma);

@@ -162,34 +162,34 @@ namespace Grid {
				    // Specialised variants
				    void DiracOptGenericDhopSite(
								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
								 commVector<SiteHalfSpinor> &buf,
								 int sF, int sU, const FermionField &in, FermionField &out);

				    void DiracOptGenericDhopSiteDag(
								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
								    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
								    commVector<SiteHalfSpinor> &buf,
								    int sF, int sU, const FermionField &in, FermionField &out);

				    void DiracOptAsmDhopSite(
							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
							     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
							     commVector<SiteHalfSpinor> &buf,
							     int sF, int sU, int Ls, int Ns, const FermionField &in,
							     FermionField &out);

				    void DiracOptAsmDhopSiteDag(
								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
								std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
								commVector<SiteHalfSpinor> &buf,
								int sF, int sU, int Ls, int Ns, const FermionField &in,
								FermionField &out);

				    void DiracOptHandDhopSite(
							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
							      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
							      commVector<SiteHalfSpinor> &buf,
							      int sF, int sU, const FermionField &in, FermionField &out);

				    void DiracOptHandDhopSiteDag(
								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
								 commVector<SiteHalfSpinor> &buf,
								 int sF, int sU, const FermionField &in, FermionField &out);

	public:
@@ -40,14 +40,14 @@ namespace Grid {
    ///////////////////////////////////////////////////////////
    template<class Impl>
      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
                             std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
                             commVector<SiteHalfSpinor>  &buf,
                             int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
    }
    template<class Impl>
      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
                                std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
                                commVector<SiteHalfSpinor>  &buf,
                                int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
@@ -86,14 +86,14 @@ namespace Grid {
#undef KERNEL_DAG
    template<>
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							 commVector<SiteHalfSpinor>  &buf,
							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

#define KERNEL_DAG
    template<>
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							    commVector<SiteHalfSpinor>  &buf,
							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

@@ -111,14 +111,14 @@ namespace Grid {
#undef KERNEL_DAG
    template<>
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								  commVector<SiteHalfSpinor>  &buf,
								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

#define KERNEL_DAG
    template<>
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								     commVector<SiteHalfSpinor>  &buf,
								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>

@@ -127,10 +127,10 @@ namespace Grid {

#define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
                                   commVector<SiteHalfSpinor>  &buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
                                   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
                                   commVector<SiteHalfSpinor>  &buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\

@@ -313,7 +313,7 @@ namespace QCD {

  template<class Impl>
  void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
					       commVector<SiteHalfSpinor>  &buf,
					       int ss,int sU,const FermionField &in, FermionField &out)
{
  typedef typename Simd::scalar_type S;
@@ -556,7 +556,7 @@ namespace QCD {

  template<class Impl>
  void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
					       commVector<SiteHalfSpinor>  &buf,
					       int ss,int sU,const FermionField &in, FermionField &out)
{
  //  std::cout << "Hand op Dhop "<<std::endl;
@@ -804,7 +804,7 @@ namespace QCD {
  ////////////////////////////////////////////////
template<>
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							     commVector<SiteHalfSpinor>  &buf,
							     int sF,int sU,const FermionField &in, FermionField &out)
{
  assert(0);
@@ -812,7 +812,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Leb

template<>
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								commVector<SiteHalfSpinor>  &buf,
								int sF,int sU,const FermionField &in, FermionField &out)
{
  assert(0);
@@ -820,7 +820,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,

template<>
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
							     commVector<SiteHalfSpinor>  &buf,
							     int sF,int sU,const FermionField &in, FermionField &out)
{
  assert(0);
@@ -828,7 +828,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Leb

template<>
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
								commVector<SiteHalfSpinor>  &buf,
								int sF,int sU,const FermionField &in, FermionField &out)
{
  assert(0);
@@ -841,10 +841,10 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,

#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
							       commVector<SiteHalfSpinor>  &buf,\
							       int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
								  commVector<SiteHalfSpinor>  &buf,\
								  int ss,int sU,const FermionField &in, FermionField &out);

INSTANTIATE_THEM(WilsonImplF);
@@ -42,6 +42,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid{
namespace Optimization {

  template<class vtype>
  union uconv {
    __m512 f;
    vtype v;
  };

  union u512f {
    __m512 v;
    float f[8];
  };

  union u512d {
    __m512 v;
    double f[4];
  };

  struct Vsplat{
    //Complex float
    inline __m512 operator()(float a, float b){
@@ -372,8 +388,9 @@ namespace Optimization {
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
#undef GNU_CLANG_COMPILER
#ifdef GNU_CLANG_COMPILER

#ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics"
  //Complex float Reduce
  template<>
    inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
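The hunk above keeps the "slow reduction" fallback for compilers that lack the _mm512_reduce_add_ps family. As a hedged sketch of what such a fallback amounts to, spill the register and sum the lanes (a __m512 holds 16 floats and a __m512d holds 8 doubles); this is illustrative only and is not the Reduce<> specialisation in Grid_avx512.h:

// Scalar fallback horizontal sums for 512-bit registers (AVX-512F only).
#include <immintrin.h>

static inline float horizontal_add_ps(__m512 v) {
  alignas(64) float lane[16];
  _mm512_store_ps(lane, v);          // aligned spill of all 16 float lanes
  float sum = 0.f;
  for (int i = 0; i < 16; i++) sum += lane[i];
  return sum;
}

static inline double horizontal_add_pd(__m512d v) {
  alignas(64) double lane[8];
  _mm512_store_pd(lane, v);          // aligned spill of all 8 double lanes
  double sum = 0.;
  for (int i = 0; i < 8; i++) sum += lane[i];
  return sum;
}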
@@ -250,8 +250,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  }
 }

template<class vobj> inline
void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
template<class vobj> inline void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;
@@ -269,8 +268,7 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  }}
}

template<class vobj> inline
void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
template<class vobj> inline void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;