
Comms improvements

This commit is contained in:
paboyle 2016-11-01 11:35:43 +00:00
parent b1508e4124
commit 791cb050c8
25 changed files with 898 additions and 287 deletions

View File

@@ -42,14 +42,13 @@ int main (int argc, char ** argv)
int Nloop=10;
int nmu=0;
for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=32;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
@@ -194,6 +193,87 @@ int main (int argc, char ** argv)
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=32;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
for(int i=0;i<Nloop;i++){
std::vector<CartesianCommunicator::CommsRequest_t> requests;
ncomm=0;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
Grid.StencilSendToRecvFromComplete(requests);
Grid.Barrier();
}
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start; // microseconds
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
}
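// Worked example for the "bytes" column (a sketch, assuming
// HalfSpinColourVectorD packs 2 spin x 3 colour complex doubles,
// i.e. 2*3*2*sizeof(double) = 96 bytes per site): for lat=16, Ls=8,
//   bytes = 16*16*16 * 8 * 96 = 3,145,728      // ~3 MB per face
// and since 1 byte/microsecond == 1 MB/s, the printed rates are
//   MB/s uni  = Nloop * bytes * 2.0 * ncomm / (stop-start)
//   MB/s bidi = 2 * (MB/s uni)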
#if 0
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

View File

@@ -1,18 +1,12 @@
#!/usr/bin/env bash
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
rm `basename ${EIGEN_URL}`
echo "-- copying fftw prototypes..."
wget ${FFTW_URL}
./scripts/update_fftw.sh `basename ${FFTW_URL}`
rm `basename ${FFTW_URL}`
echo '-- generating Make.inc files...'
./scripts/filelist
echo '-- generating configure script...'

View File

@@ -247,6 +247,9 @@ case ${ac_COMMS} in
mpi3)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
;;
mpi3l)
AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
;;
shmem)
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
;;
@@ -257,6 +260,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
AM_CONDITIONAL(BUILD_COMMS_MPI3L,[ test "X${ac_COMMS}X" == "Xmpi3lX"] )
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
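# Note: these cases dispatch on ${ac_COMMS}, so the new MPI3L communicator
# is presumably selected with --enable-comms=mpi3l at configure time, by
# analogy with the existing mpi/mpi3/shmem targets.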
############### RNG selection

View File

@@ -42,6 +42,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif

View File

@@ -9,6 +9,11 @@ if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPI3L
extra_sources+=communicator/Communicator_mpi3_leader.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc
extra_sources+=communicator/Communicator_base.cc

View File

@@ -31,13 +31,6 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void * CartesianCommunicator::ShmCommBuf;
/////////////////////////////////
@@ -70,12 +63,6 @@ int CartesianCommunicator::ProcessorCount(void) { return
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){ return WorldRank; };
int CartesianCommunicator::Ranks (void) { return WorldSize; };
int CartesianCommunicator::Nodes (void) { return GroupSize; };
int CartesianCommunicator::Cores (void) { return ShmSize; };
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
int CartesianCommunicator::CoreRank (void) { return ShmRank; };
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
@@ -94,7 +81,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N);
}
#ifndef GRID_COMMS_MPI3
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,

View File

@@ -1,3 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -37,6 +38,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
@@ -60,9 +64,9 @@ class CartesianCommunicator {
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Comm communicator;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
static MPI_Comm communicator_world;
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
@@ -75,7 +79,15 @@ class CartesianCommunicator {
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
#ifdef GRID_COMMS_MPI3
static int ShmRank;
static int ShmSize;
static int GroupRank;
static int GroupSize;
static int WorldRank;
static int WorldSize;
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
@@ -83,7 +95,7 @@ class CartesianCommunicator {
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
@@ -93,13 +105,20 @@ class CartesianCommunicator {
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
/////////////////////////////////
// Grid information and queries
// Implemented in Communicator_base.C
/////////////////////////////////
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
@@ -123,28 +142,12 @@ class CartesianCommunicator {
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
/////////////////////////////////
// Grid information and queries
/////////////////////////////////
static int ShmRank;
static int ShmSize;
static int GroupSize;
static int GroupRank;
static int WorldRank;
static int WorldSize;
static int Slave;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
static int Ranks (void);
static int Nodes (void);
static int Cores (void);
static int NodeRank (void);
static int CoreRank (void);
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid

View File

@@ -44,13 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
MPI_Init(argc,argv);
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
}
@@ -198,6 +191,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,

View File

@@ -30,12 +30,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
@@ -97,15 +103,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////
int g=0;
MyGroup.resize(ShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);

View File

@@ -0,0 +1,733 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi3_leader.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
namespace Grid {
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
struct Descriptor {
uint64_t buf;
size_t bytes;
int rank;
int tag;
int command;
MPI_Request request;
};
const int pool = 48;
class SlaveState {
public:
volatile int head;
volatile int start;
volatile int tail;
volatile Descriptor Descrs[pool];
};
class Slave {
public:
SlaveState *state;
MPI_Comm squadron;
uint64_t base;
////////////////////////////////////////////////////////////
// Descriptor circular pointers
////////////////////////////////////////////////////////////
Slave() {};
void Init(SlaveState * _state,MPI_Comm _squadron);
void EventLoop (void) {
std::cerr<< " Entering even loop "<<std::endl;
while(1) {
Event();
}
}
void Event (void) ;
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int rank) ;
void WaitAll() {
QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
std::cerr<< " Waiting on FIFO drain "<<std::endl;
while ( state->tail != state->head );
std::cerr<< " FIFO drained "<< state->tail <<std::endl;
}
};
////////////////////////////////////////////////////////////////////////
// One instance of a data mover.
// Master and Slave must agree on location in shared memory
////////////////////////////////////////////////////////////////////////
class MPIoffloadEngine {
public:
static std::vector<Slave> Slaves;
static int ShmSetup;
static int UniverseRank;
static int UniverseSize;
static MPI_Comm communicator_universe;
static MPI_Comm communicator_cached;
static MPI_Comm HorizontalComm;
static int HorizontalRank;
static int HorizontalSize;
static MPI_Comm VerticalComm;
static MPI_Win VerticalWindow;
static int VerticalSize;
static int VerticalRank;
static std::vector<void *> VerticalShmBufs;
static std::vector<std::vector<int> > UniverseRanks;
static std::vector<int> UserCommunicatorToWorldRanks;
static MPI_Group WorldGroup, CachedGroup;
static void CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf);
static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank);
/////////////////////////////////////////////////////////
// routines for master proc must handle any communicator
/////////////////////////////////////////////////////////
static uint64_t QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
std::cerr<< " Queueing send "<< bytes<<std::endl;
return Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
};
static uint64_t QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
std::cerr<< " Queueing receive "<< bytes<<std::endl;
return Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
};
static void WaitAll() {
for(int s=1;s<VerticalSize;s++) {
Slaves[s].WaitAll();
}
};
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
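// Worked example (a sketch with hypothetical values): nwork=10, units=4
//   basework = 10/4 = 2,  backfill = 4 - (10%4) = 2
//   me=0: mywork=(10+0)/4=2, myoff=0       -> bytes [0,1]
//   me=1: mywork=(10+1)/4=2, myoff=2       -> bytes [2,3]
//   me=2: mywork=(10+2)/4=3, myoff=4       -> bytes [4,6]
//   me=3: mywork=(10+3)/4=3, myoff=6+1=7   -> bytes [7,9]  (me>backfill adds me-backfill)
// The slices are disjoint and cover all nwork bytes.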
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
std::vector<Slave> MPIoffloadEngine::Slaves;
int MPIoffloadEngine::UniverseRank;
int MPIoffloadEngine::UniverseSize;
MPI_Comm MPIoffloadEngine::communicator_universe;
MPI_Comm MPIoffloadEngine::communicator_cached;
MPI_Group MPIoffloadEngine::WorldGroup;
MPI_Group MPIoffloadEngine::CachedGroup;
MPI_Comm MPIoffloadEngine::HorizontalComm;
int MPIoffloadEngine::HorizontalRank;
int MPIoffloadEngine::HorizontalSize;
MPI_Comm MPIoffloadEngine::VerticalComm;
MPI_Win MPIoffloadEngine::VerticalWindow;
int MPIoffloadEngine::VerticalSize;
int MPIoffloadEngine::VerticalRank;
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
int MPIoffloadEngine::ShmSetup = 0;
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf)
{
int flag;
assert(ShmSetup==0);
//////////////////////////////////////////////////////////////////////
// Universe is all nodes prior to squadron grouping
//////////////////////////////////////////////////////////////////////
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
MPI_Comm_rank(communicator_universe,&UniverseRank);
MPI_Comm_size(communicator_universe,&UniverseSize);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory (Verticals)
/////////////////////////////////////////////////////////////////////
// MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
MPI_Comm_split(communicator_universe,(UniverseRank&0x1),UniverseRank,&VerticalComm);
MPI_Comm_rank(VerticalComm ,&VerticalRank);
MPI_Comm_size(VerticalComm ,&VerticalSize);
//////////////////////////////////////////////////////////////////////
// Split into horizontal groups by rank in squadron
//////////////////////////////////////////////////////////////////////
MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
MPI_Comm_rank(HorizontalComm,&HorizontalRank);
MPI_Comm_size(HorizontalComm,&HorizontalSize);
assert(HorizontalSize*VerticalSize==UniverseSize);
////////////////////////////////////////////////////////////////////////////////
// What is my place in the world
////////////////////////////////////////////////////////////////////////////////
int WorldRank=0;
if(VerticalRank==0) WorldRank = HorizontalRank;
int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
assert(ierr==0);
////////////////////////////////////////////////////////////////////////////////
// Where is the world in the universe?
////////////////////////////////////////////////////////////////////////////////
UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
for(int w=0;w<HorizontalSize;w++){
ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group, pass back Shm info to CartesianCommunicator
//////////////////////////////////////////////////////////////////////////////////////////////////////////
ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
assert(ierr==0);
std::cerr<<"SHM "<<ShmCommBuf<<std::endl;
VerticalShmBufs.resize(VerticalSize);
for(int r=0;r<VerticalSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
//////////////////////////////////////////////////////////////////////
// Map the rank of the leader on each node in the new world to its
// rank in this vertical plane's horizontal communicator
//////////////////////////////////////////////////////////////////////
communicator_world = HorizontalComm;
ShmComm = VerticalComm;
ShmCommBuf = VerticalShmBufs[0];
MPI_Comm_group (communicator_world, &WorldGroup);
///////////////////////////////////////////////////////////
// Start the slave data movers
///////////////////////////////////////////////////////////
if ( VerticalRank != 0 ) {
Slave indentured;
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm);
indentured.EventLoop();
assert(0);
} else {
Slaves.resize(VerticalSize);
for(int i=1;i<VerticalSize;i++){
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm);
}
}
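// Note: ranks with VerticalRank!=0 never return from EventLoop() above;
// only the per-node leaders (VerticalRank==0) continue into Grid proper,
// which is why communicator_world is set to the leaders' HorizontalComm.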
///////////////////////////////////////////////////////////
// Verbose for now
///////////////////////////////////////////////////////////
ShmSetup=1;
if (UniverseRank == 0){
std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
std::cout<<UniverseSize << " Ranks " ;
std::cout<<HorizontalSize << " Nodes " ;
std::cout<<VerticalSize << " ranks per node"<<std::endl;
std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
for(int g=0;g<HorizontalSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
}
for(int g=0;g<HorizontalSize;g++){
std::cout<<GridLogMessage<<" { ";
for(int s=0;s<VerticalSize;s++){
std::cout<< UniverseRanks[g][s];
if ( s<VerticalSize-1 ) {
std::cout<<",";
}
}
std::cout<<" } "<<std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////
// Map the communicator into communicator_world, and find the neighbour.
// Cache the mappings; cache size is 1.
///////////////////////////////////////////////////////////////////////////////////////////////
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
if ( comm == HorizontalComm ) {
comm_world_peer = rank;
} else if ( comm == communicator_cached ) {
comm_world_peer = UserCommunicatorToWorldRanks[rank];
} else {
int size;
MPI_Comm_size(comm,&size);
UserCommunicatorToWorldRanks.resize(size);
std::vector<int> cached_ranks(size);
for(int r=0;r<size;r++) {
cached_ranks[r]=r;
}
communicator_cached=comm;
MPI_Comm_group(communicator_cached, &CachedGroup);
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
comm_world_peer = UserCommunicatorToWorldRanks[rank];
assert(comm_world_peer != MPI_UNDEFINED);
}
assert( (tag & (~0xFFFFL)) ==0);
uint64_t icomm = (uint64_t)comm;
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
hashtag = (comm_hash<<15) | tag;
};
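// Sketch of the packing above: the MPI_Comm handle is XOR-folded into a
// 16-bit comm_hash, then hashtag = (comm_hash<<15) | tag, so traffic from
// different user communicators is disambiguated on the single universe
// communicator via the high bits of the MPI tag, while the low bits carry
// the caller's tag (asserted above to fit in 16 bits).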
void Slave::Init(SlaveState * _state,MPI_Comm _squadron)
{
squadron=_squadron;
state =_state;
state->head = state->tail = state->start = 0;
MPI_Barrier(squadron);
base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
int rank; MPI_Comm_rank(_squadron,&rank);
}
#define PERI_PLUS(A) ( (A+1)%pool )
void Slave::Event (void) {
static int tail_last;
static int head_last;
static int start_last;
int ierr;
if ( (state->tail != tail_last)
||(state->head != head_last)
||(state->start != start_last)
) {
std::cerr<< " Event loop "<< state->tail << " "<< state->start<< " "<< state->head <<std::endl;
}
////////////////////////////////////////////////////
// Try to advance the tail pointers
////////////////////////////////////////////////////
/*
int t=state->tail;
if ( t != state->start ) {
int flag=0;
std::cerr<< " Testing tail "<< t<<" "<< (void *)&state->Descrs[t].request
<< " "<<state->Descrs[t].request<<std::endl;
// ierr=MPI_Test((MPI_Request *)&state->Descrs[t].request,&flag,MPI_STATUS_IGNORE);
// ierr=MPI_Test((MPI_Request *)&state->Descrs[t].request,&flag,MPI_STATUS_IGNORE);
assert(ierr==0);
if ( flag ) {
state->tail = PERI_PLUS(t);
std::cerr<< " Tail advanced from "<< t<<std::endl;
return;
}
}
*/
////////////////////////////////////////////////////
// Try to advance the start pointers
////////////////////////////////////////////////////
int s=state->start;
if ( s != state->head ) {
switch ( state->Descrs[s].command ) {
case COMMAND_ISEND:
std::cerr<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
<< " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
<< " Comm " << MPIoffloadEngine::communicator_universe<< std::endl;
std::cerr<< " Request was "<<state->Descrs[s].request<<std::endl;
ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
state->Descrs[s].rank,
state->Descrs[s].tag,
MPIoffloadEngine::communicator_universe,
(MPI_Request *)&state->Descrs[s].request);
std::cerr<< " Request is "<<state->Descrs[s].request<<std::endl;
std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
assert(ierr==0);
state->start = PERI_PLUS(s);
break;
case COMMAND_IRECV:
std::cerr<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
<< " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
<< " Comm " << MPIoffloadEngine::communicator_universe<< std::endl;
std::cerr<< " Request was "<<state->Descrs[s].request<<std::endl;
ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
state->Descrs[s].rank,
state->Descrs[s].tag,
MPIoffloadEngine::communicator_universe,
(MPI_Request *)&state->Descrs[s].request);
std::cerr<< " Request is "<<state->Descrs[s].request<<std::endl;
std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
assert(ierr==0);
state->start = PERI_PLUS(s);
break;
case COMMAND_WAITALL:
std::cerr<< " Wait all "<<std::endl;
for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
std::cerr<< " Wait ["<<t<<"] "<<state->Descrs[t].request <<std::endl;
std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
};
s=PERI_PLUS(s);
state->start = s;
state->tail = s;
break;
default:
assert(0);
break;
}
return;
}
}
//////////////////////////////////////////////////////////////////////////////
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
/////////////////////////////////////////
// Spin while the FIFO is full, until a slot frees up
/////////////////////////////////////////
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
MPI_Comm communicator;
MPI_Request request;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
int VerticalRank = MPIoffloadEngine::VerticalRank;
uint64_t relative= (uint64_t)buf - base;
state->Descrs[head].buf = relative;
state->Descrs[head].bytes = bytes;
state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][VerticalRank];
state->Descrs[head].tag = hashtag;
state->Descrs[head].command= command;
std::cerr<< " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
// Block until FIFO has space
while( state->tail==next );
// Msync on weak order architectures
// Advance pointer
state->head = next;
return 0;
}
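// Ring-buffer protocol in summary:
//   head  : next free slot, advanced only by the master in QueueCommand
//   start : next descriptor for the slave to issue (MPI_Isend/MPI_Irecv)
//   tail  : oldest unretired descriptor, advanced on COMMAND_WAITALL
// The master blocks while PERI_PLUS(head)==tail (FIFO full); the slave
// spins in EventLoop() until start!=head.  One producer, one consumer,
// so the volatile descriptor ring in shared memory suffices.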
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
communicator_world = MPI_COMM_WORLD;
MPI_Comm ShmComm;
MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator_world,&Size);
assert(Size==_Nprocessors);
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank (communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
};
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
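// Note: the blocking SendToRecvFrom above is simply SendToRecvFromBegin
// followed by SendToRecvFromComplete, so blocking and asynchronous halo
// traffic share a single code path.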
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
uint64_t xmit_i = (uint64_t) xmit;
uint64_t recv_i = (uint64_t) recv;
uint64_t shm = (uint64_t) ShmCommBuf;
// assert xmit and recv lie in shared memory region
assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
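// Note (sketch): each stencil transfer above is striped across the
// VerticalSize-1 slave movers by QueueMultiplexedSend/Recv, turning one
// logical halo exchange into several concurrent MPI messages; the
// StencilSendToRecvFromComplete below drains every slave FIFO via WaitAll.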
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
MPIoffloadEngine::WaitAll();
}
void CartesianCommunicator::StencilBarrier(void)
{
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
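// Note: ShmBuffer/ShmBufferTranslate intentionally return NULL in this
// implementation; peer data moves through the slave processes rather than
// by direct load/store into a neighbour's shared window.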
};

View File

@@ -34,13 +34,6 @@ namespace Grid {
void CartesianCommunicator::Init(int *argc, char *** arv)
{
WorldRank = 0;
WorldSize = 1;
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
}
@@ -99,6 +92,7 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(0);
}
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }

View File

@@ -50,11 +50,16 @@ typedef struct HandShake_t {
uint64_t seq_remote;
} HandShake;
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
ret.fill(SHMEM_SYNC_VALUE);
return ret;
}
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
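// Note on the change above: SHMEM collectives require each pSync work
// array to be initialised to SHMEM_SYNC_VALUE before first use, so the
// previously zero-initialised `static long psync[...]` arrays are replaced
// below by copies of psync_init, which fills every entry correctly.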
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
@@ -65,13 +70,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0;
}
WorldSize = shmem_n_pes();
WorldRank = shmem_my_pe();
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
shmem_barrier_all();
ShmInitGeneric();
}
@@ -103,7 +101,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1;
// int pestart=0;
@@ -119,7 +117,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1;
// int pestart=0;
@@ -135,7 +133,7 @@ void CartesianCommunicator::GlobalSum(float &f){
static float source ;
static float dest ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = f;
dest =0.0;
@@ -147,7 +145,7 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
static float source ;
static float dest = 0 ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(f,_processor) ){
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
@@ -166,7 +164,7 @@ void CartesianCommunicator::GlobalSum(double &d)
static double source;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = d;
dest = 0;
@@ -178,7 +176,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
static double source ;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(d,_processor) ){
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
@@ -295,7 +294,7 @@ void CartesianCommunicator::Barrier(void)
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
@@ -318,7 +317,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);

View File

@@ -1 +0,0 @@
./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/arm-linux-gnueabihf/include/c++/4.8.2/arm-linux-gnueabihf/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a7' --enable-simd=NEONv7

View File

@@ -1,3 +0,0 @@
#./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
./configure --host=aarch64-linux-gnu CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7

View File

@@ -1,9 +0,0 @@
for omp in 1 2 4
do
echo > wilson.t$omp
for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16
do
perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'`
echo $vol $perf >> wilson.t$omp
done
done

View File

@@ -1,46 +0,0 @@
#!/bin/bash -e
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
BLACK="\033[30m"
RED="\033[31m"
GREEN="\033[32m"
YELLOW="\033[33m"
BLUE="\033[34m"
PINK="\033[35m"
CYAN="\033[36m"
WHITE="\033[37m"
NORMAL="\033[0;39m"
for D in $DIRS
do
echo
echo -e $RED ==============================
echo -e $GREEN $D
echo -e $RED ==============================
echo -e $BLUE
cd builds/$D
make clean all -j 8
cd ../../
echo -e $NORMAL
done
if [ "X$1" == "Xextra" ]
then
for D in $EXTRADIRS
do
echo
echo -e $RED ==============================
echo -e $RED $D
echo -e $RED ==============================
echo -e $BLUE
cd builds/$D
make clean all -j 8
cd ../../
echo -e $NORMAL
done
fi

View File

@@ -1,11 +0,0 @@
#!/bin/bash
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"
for D in $DIRS
do
mkdir -p builds/$D
cd builds/$D
../../scripts/configure-commands $D
cd ../..
done

View File

@@ -1,89 +0,0 @@
#!/bin/bash
WD=$1
BLACK="\033[30m"
RED="\033[31m"
GREEN="\033[32m"
YELLOW="\033[33m"
BLUE="\033[34m"
PINK="\033[35m"
CYAN="\033[36m"
WHITE="\033[37m"
NORMAL="\033[0;39m"
echo
echo -e $RED ==============================
echo -e $GREEN $WD
echo -e $RED ==============================
echo -e $YELLOW
case $WD in
g++-avx)
CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
g++-avx-openmp)
CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=none
;;
g++5-sse4)
CXX=g++-5 ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
g++5-avx)
CXX=g++-5 ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-avx)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-avx-openmp-mpi)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
icpc-avx-openmp)
CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
icpc-avx2)
CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-avx512)
CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-mic)
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
;;
icpc-mic-avx512)
CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-sse)
CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx2)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx-openmp)
CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-xc30)
CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-xc30-openmp)
CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx2-openmp)
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
clang-avx-openmp-mpi)
CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx2-openmp-mpi)
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx-mpi)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx2-mpi)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx2)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LDFLAGS="-L/usr/local/lib/" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
esac
echo -e $NORMAL

View File

@@ -1,10 +0,0 @@
#!/bin/bash
DIRS="g++-avx-openmp g++-avx clang-xc30 clang-xc30-openmp"
for D in $DIRS
do
mkdir -p builds/$D
cd builds/$D
../../scripts/configure-commands $D
cd ../..
done

View File

@@ -1,10 +0,0 @@
#!/bin/bash
DIRS="build-icpc-mic"
for D in $DIRS
do
mkdir -p $D
cd $D
../configure-commands
cd ..
done

View File

@@ -12,6 +12,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: $1
Copyright (C) 2015
Copyright (C) 2016
EOF
@@ -38,8 +39,21 @@ See the full license in the file "LICENSE" in the top level distribution directo
/* END LEGAL */
EOF
cat message > tmp.fil
cat $1 >> tmp.fil
NOTICE=`grep -n "END LEGAL" $1 | awk '{ print $1 }' `
if [ "X$NOTICE" != "X" ]
then
echo "found notice ending on line $NOTICE"
awk 'BEGIN { P=0 } { if ( P ) print } /END LEGAL/{P=1} ' $1 >> tmp.fil
else
cat $1 >> tmp.fil
fi
cp tmp.fil $1
shift

View File

@@ -1,2 +0,0 @@
module swap PrgEnv-cray PrgEnv-intel
module swap intel/14.0.4.211 intel/15.0.2.164

View File

@@ -1,4 +0,0 @@
aclocal -I m4
autoheader -f
automake -f --add-missing
autoconf -f

View File

@@ -1,18 +0,0 @@
#!/usr/bin/env bash
if (( $# != 1 )); then
echo "usage: `basename $0` <archive>" 1>&2
exit 1
fi
ARC=$1
INITDIR=`pwd`
rm -rf lib/fftw
mkdir lib/fftw
ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'`
tar -xf ${ARC}
cp ${ARCDIR}/api/fftw3.h lib/fftw/
cd ${INITDIR}
rm -rf ${ARCDIR}

View File

@@ -1,7 +0,0 @@
plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
set terminal 'pdf'
set output 'wilson_clang.pdf'
replot
quit