
MPI is now working and passing basic tests. Will start to construct a more sensible test suite shortly, since testing requirements now go beyond what a single Grid_main.cc can do.

This will need a more organised src tree and a substantial reorganisation of the build system.
This commit is contained in:
Peter Boyle 2015-04-03 04:52:53 +01:00
parent 19b9069453
commit 7b97e50b7b
17 changed files with 1298 additions and 954 deletions

2
Grid.h
View File

@ -52,7 +52,7 @@
namespace dpo {
void Grid_init(void);
void Grid_init(int *argc,char ***argv);
double usecond(void);
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);

View File

@ -41,8 +41,8 @@ public:
std::vector<int> _simd_layout; // Which dimensions get laid out over simd lanes.
std::vector<int> _fdimensions;// Global dimensions of array with cb removed
std::vector<int> _gdimensions;// Global dimensions of array
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
@ -88,11 +88,20 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
return idx;
}
inline int iCoordFromIsite(int lane,int mu)
{
std::vector<int> coor(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = lane % _simd_layout[d];
lane = lane / _simd_layout[d];
}
return coor[mu];
}
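The lane decoding in iCoordFromIsite above is a mixed-radix decomposition: the flat SIMD lane index is peeled apart digit by digit using the per-dimension simd layout. A standalone sketch of the same arithmetic, with a hypothetical 2x2 layout and no Grid types assumed:

```cpp
#include <cstdio>
#include <vector>

// Decode a flat SIMD lane index into per-dimension coordinates, given
// how many lanes each dimension is split over (mixed-radix digits).
std::vector<int> lane_to_coor(int lane, const std::vector<int> &simd_layout) {
  std::vector<int> coor(simd_layout.size());
  for (unsigned d = 0; d < simd_layout.size(); d++) {
    coor[d] = lane % simd_layout[d]; // digit for dimension d
    lane    = lane / simd_layout[d]; // move on to the next digit
  }
  return coor;
}

int main() {
  std::vector<int> layout(4, 1);
  layout[0] = 2; layout[1] = 2;    // 2 lanes in x, 2 in y -> 4 lanes
  for (int lane = 0; lane < 4; lane++) {
    std::vector<int> c = lane_to_coor(lane, layout);
    printf("lane %d -> (%d,%d,%d,%d)\n", lane, c[0], c[1], c[2], c[3]);
  }
  return 0;
}
```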
inline int oSites(void) { return _osites; };
inline int iSites(void) { return _isites; };
inline int CheckerboardFromOsite (int Osite){
inline int CheckerBoardFromOsite (int Osite){
std::vector<int> ocoor;
CoordFromOsite(ocoor,Osite);
int ss=0;
@ -109,6 +118,7 @@ public:
}
}
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
@ -116,6 +126,9 @@ public:
class GridCartesian: public SimdGrid {
public:
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return 0;
}
@ -199,6 +212,10 @@ public:
class GridRedBlackCartesian : public SimdGrid
{
public:
virtual int CheckerBoarded(int dim){
if( dim==0) return 1;
else return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return (site[0]+site[1]+site[2]+site[3])&0x1;
}
@ -215,11 +232,13 @@ public:
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
int ocb=CheckerboardFromOsite(osite);
int ocb=CheckerBoardFromOsite(osite);
if ( (source_cb+ocb)&1 ) {
printf("Checkerboard shift %d\n",(shift)/2);
return (shift)/2;
} else {
printf("Checkerboard shift %d\n",(shift+1)/2);
return (shift+1)/2;
}
}
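The halving above can be checked with plain integers: on a red-black grid the checkerboarded dimension stores every other site, so a full-lattice shift of `shift` moves a stored site by either `shift/2` or `(shift+1)/2` reduced sites, depending on the parity of the site relative to the source checkerboard. A minimal sketch of that arithmetic (no Grid types assumed):

```cpp
#include <cstdio>

// Reduced shift on a checkerboarded (halved) dimension, mirroring the
// branch in CheckerBoardShift above.
int cb_shift(int source_cb, int site_cb, int shift) {
  if ((source_cb + site_cb) & 1) return shift / 2;
  else                           return (shift + 1) / 2;
}

int main() {
  for (int shift = 0; shift < 5; shift++)
    printf("shift %d : odd-sum sites move %d reduced sites, even-sum %d\n",
           shift, cb_shift(0, 1, shift), cb_shift(0, 0, shift));
  return 0;
}
```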

View File

@ -3,12 +3,16 @@
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
namespace dpo {
class CartesianCommunicator {
public:
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get laid out over processor lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
@ -20,7 +24,9 @@ class CartesianCommunicator {
CartesianCommunicator(std::vector<int> &pdimensions_in);
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int Rank(std::vector<int> coor);
int MyRank(void);
void GlobalSumF(float &);
void GlobalSumFVector(float *,int N);
@ -28,9 +34,9 @@ class CartesianCommunicator {
void GlobalSumFVector(double *,int N);
void SendToRecvFrom(void *xmit,
std::vector<int> to_coordinate,
int xmit_to_rank,
void *recv,
std::vector<int> from_coordinate,
int recv_from_rank,
int bytes);
};
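With ranks in the interface, the calling pattern for a halo exchange becomes a ShiftedRanks/SendToRecvFrom pair. A hedged usage sketch against this class, not a test from the repository: it assumes MPI is initialised, a communicator built over a 1-d processor grid, and the dpo headers on the include path (the include name below is hypothetical).

```cpp
#include <vector>
// #include "Grid.h"  // hypothetical include; whatever declares dpo::CartesianCommunicator

// Ship `send` one processor step up dimension 0 of the processor grid,
// receiving the matching buffer from one step down into `recv`.
void halo_shift(dpo::CartesianCommunicator &comm,
                std::vector<char> &send, std::vector<char> &recv) {
  int xmit_to_rank, recv_from_rank;
  comm.ShiftedRanks(0, 1, xmit_to_rank, recv_from_rank);
  comm.SendToRecvFrom((void *)&send[0], xmit_to_rank,
                      (void *)&recv[0], recv_from_rank,
                      (int)send.size());
}
```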

View File

@ -23,7 +23,8 @@ public:
SimdGrid *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
public:
@ -36,6 +37,8 @@ public:
}
#include <Grid_cshift_common.h>
#ifdef GRID_COMMS_NONE
#include <Grid_none_cshift.h>
#endif

View File

@ -11,10 +11,10 @@
/* #undef AVX512 */
/* GRID_COMMS_FAKE */
#define GRID_COMMS_FAKE 1
/* #undef GRID_COMMS_FAKE */
/* GRID_COMMS_MPI */
/* #undef GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1
/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */
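Configure defines exactly one of the three comms macros, and headers key off it the same way Grid_Lattice.h selects its cshift include. A minimal sketch of that compile-time dispatch pattern (the macro is set by hand here instead of by Grid_config.h):

```cpp
#include <cstdio>

#define GRID_COMMS_MPI 1   // stand-in for configure choosing the MPI target

#if defined(GRID_COMMS_MPI)
static const char *comms_target = "mpi";
#elif defined(GRID_COMMS_FAKE)
static const char *comms_target = "fake";
#elif defined(GRID_COMMS_NONE)
static const char *comms_target = "none";
#else
#error "no comms target defined"
#endif

int main() { printf("comms target: %s\n", comms_target); return 0; }
```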

View File

@ -9,6 +9,15 @@
/* AVX512 */
#undef AVX512
/* GRID_COMMS_FAKE */
#undef GRID_COMMS_FAKE
/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY
@ -60,6 +69,9 @@
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION

384
Grid_cshift_common.h Normal file
View File

@ -0,0 +1,384 @@
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
// printf("Gather plane _simple mask %d\n",cbmask);
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
buffer[bo++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// int jjj=0;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=rhs._odata[so+o+b];
// float * ptr = (float *)& rhs._odata[so+o+b];
// if( (cbmask!=3)&&(jjj<8)){
// printf("Gather_plane_simple %d %le bo %d\n",so+o+b,*ptr,bo);
// jjj++;
// }
bo++;
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
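Gather_plane_simple walks a plane as `_slice_nblock` runs of `_slice_block` contiguous sites, each run `_slice_stride` apart, starting at `plane*_ostride`. The same indexing on a plain array, with made-up strides for a 4x4 lattice, shows how the buffer comes out dense:

```cpp
#include <cstdio>
#include <vector>

// Gather one plane of a strided array into a dense buffer:
// nblock runs of block contiguous elements, each run slice_stride apart.
void gather_plane(const std::vector<int> &data, std::vector<int> &buffer,
                  int plane, int ostride, int nblock, int block, int slice_stride) {
  int so = plane * ostride; // base offset of the plane
  int o  = 0;               // offset within the plane
  int bo = 0;               // dense offset in the buffer
  for (int n = 0; n < nblock; n++) {
    for (int b = 0; b < block; b++) buffer[bo++] = data[so + o + b];
    o += slice_stride;
  }
}

int main() {
  // Hypothetical 4x4 lattice, x fastest: plane x=1 is elements 1,5,9,13.
  std::vector<int> data(16);
  for (int i = 0; i < 16; i++) data[i] = i;
  std::vector<int> buf(4);
  gather_plane(data, buf, /*plane*/1, /*ostride*/1, /*nblock*/4, /*block*/1, /*stride*/4);
  for (int i = 0; i < 4; i++) printf("%d ", buf[i]);  // prints 1 5 9 13
  printf("\n");
  return 0;
}
```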
//////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
extract(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb & cbmask ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
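extract and merge split a vector object into one scalar stream per SIMD lane and recombine it; Gather_plane_extract fills Nsimd separate buffers this way so each lane's data can be shipped to a different destination. A scalar stand-in for the two primitives (plain floats instead of vector registers; the real Grid signatures differ):

```cpp
#include <cstdio>
#include <vector>

const int Nsimd = 4;
struct vobj { float lane[Nsimd]; };  // stand-in for a SIMD vector object

// Append each lane of v to its own per-lane stream.
void extract(const vobj &v, std::vector<std::vector<float> > &streams) {
  for (int i = 0; i < Nsimd; i++) streams[i].push_back(v.lane[i]);
}
// Rebuild a vobj by reading position pos from every per-lane stream.
vobj merge(const std::vector<std::vector<float> > &streams, int pos) {
  vobj v;
  for (int i = 0; i < Nsimd; i++) v.lane[i] = streams[i][pos];
  return v;
}

int main() {
  std::vector<std::vector<float> > streams(Nsimd);
  vobj a = {{0, 1, 2, 3}}, b = {{4, 5, 6, 7}};
  extract(a, streams);
  extract(b, streams);
  vobj c = merge(streams, 1);  // recombines b
  printf("%g %g %g %g\n", c.lane[0], c.lane[1], c.lane[2], c.lane[3]);
  return 0;
}
```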
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
rhs._odata[so+o+b]=buffer[bo++];
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
merge(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
// if lhs is odd, rhs even??
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
// int jjj=0;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb&cbmask ) {
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
// float *ptr =(float *) &rhs._odata[ro+o+b];
// if((cbmask!=0x3)&&jjj<8) {
// printf("Copy_plane %d %le n,b=%d,%d mask %d ocb %d\n",ro+o+b,*ptr,n,b,cbmask,ocb);
// jjj++;
// }
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*rhs._grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Local to node Cshift
//////////////////////////////////////////////////////
// Work out whether to permute
// ABCDEFGH -> AE BF CG DH permute wrap num
//
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
// Suppose 4way simd in one dim.
// ABCDEFGH -> AECG BFDH permute wrap num
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
// Generalisation to 8 way simd, 16 way simd required.
//
// Need log2 Nway masks. consisting of
// 1 bit 256 bit granule
// 2 bit 128 bit granule
// 4 bits 64 bit granule
// 8 bits 32 bit granules
//
// 15 bits....
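The wrap/num columns in the tables above fall out of two integer divisions, and the per-slice permute decision is the `x < rd-num` test used throughout the shift routines. A sketch that regenerates the permute column of the 2-way-SIMD table (ld=8, rd=4):

```cpp
#include <cstdio>

int main() {
  int rd = 4;               // reduced dimension: 8 sites, 2-way simd
  for (int shift = 0; shift < 8; shift++) {
    int wrap = shift / rd;  // how many times the shift wraps the reduced dim
    int num  = shift % rd;  // how many slices wrap one extra time
    printf("shift %d  wrap %d num %d  permute per slice:", shift, wrap, num);
    for (int x = 0; x < rd; x++) {
      int permute_slice = (x < rd - num) ? wrap : 1 - wrap;
      printf(" %d", permute_slice);
    }
    printf("\n");
  }
  return 0;
}
```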
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_ldimensions[dimension];
int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * rhs._grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
return ret;
}
#endif

View File

@ -16,8 +16,11 @@
#undef __X86_64
namespace dpo {
void Grid_init(void)
void Grid_init(int *argc,char ***argv)
{
#ifdef GRID_COMMS_MPI
MPI_Init(argc,argv);
#endif
Grid_debug_handler_init();
}
double usecond(void) {

View File

@ -13,34 +13,28 @@ using namespace dpo::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#if 0
struct sigaction sa,osa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= sa_action;
sa.sa_flags = SA_ONSTACK|SA_SIGINFO;
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
#endif
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout(4);
for(int d=0;d<4;d++) mpi_layout[d]=1;
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout(4);
mpi_layout[0]=4;
mpi_layout[1]=2;
mpi_layout[2]=4;
mpi_layout[3]=2;
#ifdef AVX512
for(int omp=128;omp<236;omp+=16){
#else
for(int omp=1;omp<8;omp*=2){
for(int omp=1;omp<8;omp*=20){
#endif
#ifdef OMP
omp_set_num_threads(omp);
#endif
for(int lat=4;lat<=16;lat+=40){
for(int lat=16;lat<=16;lat+=40){
latt_size[0] = lat;
latt_size[1] = lat;
latt_size[2] = lat;
@ -234,9 +228,9 @@ int main (int argc, char ** argv)
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
Shifted = Cshift(Foo,dir,shift); // Shift everything
std::cout << "Shifting even parities"<<std::endl;
std::cout << "Shifting even source parities to odd result"<<std::endl;
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
std::cout << "Shifting odd parities"<<std::endl;
std::cout << "Shifting odd parities to even result"<<std::endl;
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
ShiftedCheck=zero;
@ -245,19 +239,19 @@ int main (int argc, char ** argv)
// Check results
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
std::complex<dpo::Real> diff;
std::vector<int> shiftcoor = coor;
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%(latt_size[dir]/mpi_layout[dir]);
std::vector<int> rl(4);
for(int dd=0;dd<4;dd++){
rl[dd] = latt_size[dd]/simd_layout[dd];
rl[dd] = latt_size[dd]/simd_layout[dd]/mpi_layout[dd];
}
int lex = coor[0]%rl[0]
+ (coor[1]%rl[1])*rl[0]
@ -344,4 +338,6 @@ int main (int argc, char ** argv)
} // loop for lat
} // loop for omp
MPI_Finalize();
}
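The check loop above runs over local (per-rank) coordinates and wraps the shifted coordinate inside the local extent. A standalone sketch of the two index computations it relies on, with hypothetical sizes (the real test derives them from the lat and omp loops):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical global sizes and decomposition, mirroring the test above.
  std::vector<int> latt_size(4, 16);
  int mpi_layout[4]  = {4, 2, 4, 2};
  int simd_layout[4] = {1, 1, 2, 2};
  int dir = 0, shift = 3;

  // Local (per-rank) extent and the simd-reduced extent per dimension.
  std::vector<int> local(4), rl(4);
  for (int d = 0; d < 4; d++) {
    local[d] = latt_size[d] / mpi_layout[d];
    rl[d]    = local[d] / simd_layout[d];
  }
  // Expected source coordinate for a sample local site, wrapped locally.
  std::vector<int> coor(4); coor[0]=1; coor[1]=2; coor[2]=3; coor[3]=0;
  int shifted = (coor[dir] + shift + local[dir]) % local[dir];
  // Lexicographic index within the simd-reduced local volume.
  int lex = (coor[0] % rl[0])
          + (coor[1] % rl[1]) * rl[0]
          + (coor[2] % rl[2]) * rl[0] * rl[1]
          + (coor[3] % rl[3]) * rl[0] * rl[1] * rl[2];
  printf("shifted x %d -> %d ; lex %d\n", coor[0], shifted, lex);
  return 0;
}
```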

View File

@ -7,16 +7,23 @@ namespace dpo {
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = _processors.size();
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coords.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
_processor_coor.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
printf("Hello world from processor [");
for(int i=0;i<_ndimension;i++){
printf("%d ",_processor_coor[i]);
_Nprocessors*=_processors[i];
}
printf("]\n");
fflush(stdout);
}
void CartesianCommunicator::GlobalSumF(float &f){
@ -35,11 +42,9 @@ void CartesianCommunicator::GlobalSumFVector(double *d,int N)
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
int CartesianCommunicator::ShiftedRank(int dim,int shift)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int rank;
MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
return rank;
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
}
int CartesianCommunicator::Rank(std::vector<int> coor)
{
@ -50,23 +55,18 @@ int CartesianCommunicator::Rank(std::vector<int> coor)
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
std::vector<int> &to_coordinate,
int dest,
void *recv,
std::vector<int> &from_coordinate,
int from,
int bytes)
{
int dest = Rank(to_coordinate);
int from = Rank(from_coordinate);
MPI_Request reqs[2];
MPI_Status OkeyDokey[2];
int rank = _processor;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
MPI_Waitall(2,reqs,OkeyDokey);
MPI_Request x_req;
MPI_Request r_req;
MPI_Status OkeyDokey;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&x_req);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
MPI_Wait(&x_req,&OkeyDokey);
MPI_Wait(&r_req,&OkeyDokey);
MPI_Barrier(communicator);
}
}
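The constructor and ShiftedRanks above are thin wrappers over MPI's Cartesian topology calls. A self-contained program covering the same three steps (periodic Cartesian communicator, own coordinates, neighbour ranks for a unit shift):

```cpp
// Compile with mpicxx; run with e.g. mpirun -np 4.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int dims[1] = {0}, periodic[1] = {1};
  int nproc;
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
  MPI_Dims_create(nproc, 1, dims);            // fill in a 1-d layout
  MPI_Comm cart;
  MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periodic, 1, &cart);
  int rank, coor[1];
  MPI_Comm_rank(cart, &rank);
  MPI_Cart_coords(cart, rank, 1, coor);
  int source, dest;
  MPI_Cart_shift(cart, 0, 1, &source, &dest); // neighbours for shift +1
  printf("rank %d coor %d : recv-from %d send-to %d\n",
         rank, coor[0], source, dest);
  MPI_Finalize();
  return 0;
}
```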

View File

@ -1,26 +1,9 @@
#ifndef _GRID_MPI_CSHIFT_H_
#define _GRID_MPI_CSHIFT_H_
//////////////////////////////////////////////
// Q. Split this into separate sub functions?
//////////////////////////////////////////////
// CshiftCB_comms_splice
// CshiftCB_comms
// CshiftCB_local
// CshiftCB_local_permute
// Cshift_comms_splice
// Cshift_comms
// Cshift_local
// Cshift_local_permute
// Broadly I remain annoyed that the iteration is so painful
// for red black data layout, when simple block strided descriptors suffice for non-cb.
//
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
//
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so probably need
// additional stencil support.
@ -33,79 +16,31 @@
//
// Grid could create a neighbour index table for a given stencil.
// Could also implement CovariantCshift.
//////////////////////////////////////////////////////////////////////////////////////////
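The neighbour index table suggested above is not implemented in this commit; a hypothetical sketch of what one could look like for a nearest-neighbour stencil on a periodic 1-d lattice (names and layout illustrative, not Grid API):

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stencil table: for each site, precompute the indices of
// its +1 and -1 neighbours so the hot loop does no modular arithmetic.
struct StencilTable {
  std::vector<int> fwd, bwd;
  StencilTable(int L) : fwd(L), bwd(L) {
    for (int x = 0; x < L; x++) {
      fwd[x] = (x + 1) % L;       // periodic wrap forwards
      bwd[x] = (x - 1 + L) % L;   // periodic wrap backwards
    }
  }
};

int main() {
  StencilTable t(8);
  for (int x = 0; x < 8; x++)
    printf("site %d : +1 -> %d, -1 -> %d\n", x, t.fwd[x], t.bwd[x]);
  return 0;
}
```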
//////////////////////////////////////////////////////
//Non checkerboarded support functions
//////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Q. Further split this into separate sub functions?
/////////////////////////////////////////////////////////////
friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)
{
const int Nsimd = vector_type::Nsimd();
int rd = rhs._grid->_rdimensions[dimension];
// CshiftCB_local
// CshiftCB_local_permute
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//
//////////////////////////////////////////////////////
//Checkerboarded support functions
//////////////////////////////////////////////////////
//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// Cshift_comms_splice
// Cshift_comms
// Cshift_local
// Cshift_local_permute
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
const int Nsimd = vector_type::Nsimd();
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
//int ld = rhs._grid->_ldimensions[dimension];
//int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
@ -115,270 +50,288 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
// Logic for non-distributed dimension
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_to (simd_layout);
std::vector<int> comm_from (simd_layout);
std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane
std::vector<int> comm_simd_lane(simd_layout);// simd lane of neighbour plane
///////////////////////////////////////////////
// Move via a fake comms buffer
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<scalar_type *> pointers(Nsimd);
if ( permute_dim ) {
for(int x=0;x<rd;x++){
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
int o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
if ( !comm_dim ) {
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
}
for(int x=0;x<rd;x++){
///////////////////////////////////////////
// Extract one orthogonal slice at a time
///////////////////////////////////////////
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
for(int s=0;s<simd_layout;s++) {
// shift to "neighbour" takes us off node
// coordinates (rx, simd_lane) of neighbour
// how many nodes away is this shift
// where we should send to
// where we should receive from
int shifted_x = x+s*rd+shift;
comm_offnode[s] = shifted_x > ld;
comm_send_rx[s] = shifted_x%rd; // which slice to get on the other node
comm_send_simd_lane [s] = shifted_x/rd; // which simd lane on the other node
comm_from[s] = shifted_x/ld;
comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];
}
////////////////////////////////////////////////
// Insert communication phase
////////////////////////////////////////////////
#if 0
} else if (comm_dim ) {
// Packed gather sequence is clean
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
// off node; communicate slice (ld==rd)
if ( x+shift > rd ) {
int sb=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
send_buf[sb++]=rhs._odata[so+i];
}
so+=rhs._grid->_slice_stride[dimension];
}
// Make a comm_fake that mimics comms in the periodic case.
// scatter face
int rb=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[so+i]=recv_buf[rb++];
}
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#endif
////////////////////////////////////////////////
// Pull receive buffers and permuted buffers in
////////////////////////////////////////////////
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
}
o = 0; // relative offset to base
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
merge(ret._odata[ro+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else if ( comm_dim ) {
int co; // comm offset
int o;
co=0;
for(int x=0;x<rd;x++){
o=0;
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in the inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
comm_buf[co++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
// Step through a copy into a comms buffer and pull back in.
// Genuine fake implementation could calculate if loops back
co=0; o=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
ret._odata[ro+o+b]=comm_buf[co++];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else { // Local dimension, no permute required
for(int x=0;x<rd;x++){
int o=0;
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in the inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
}
friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
// printf("Cshift_comms : single pass\n");
Cshift_comms(ret,rhs,dimension,shift,0x3);
} else {
// printf("Cshift_comms : two pass\n");
// printf("call1\n");
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
// printf("call2\n");
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
// printf("done\n");
}
}
friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
// printf("call1 0x1 cb=even\n");
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
// printf("call2 0x2 cb=odd\n");
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
// printf("done\n");
}
}
friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
SimdGrid *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
// Packed gather sequence is clean
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
// This code could be simplified by multiple calls to single routine with extra params to
// encapsulate the difference in the code paths.
int cb= (cbmask==0x2)? 1 : 0;
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
// printf("local x %d sshift %d offnode %d rd %d cb %d\n",x,sshift,offnode,rd,cb);
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
// printf("nonlocal x %d sx %d sshift %d offnode %d rd %d cb %d cbmask %d rhscb %d comm_proc %d\n",
// x,sx,sshift,offnode,rd,cb,cbmask,rhs.checkerboard,comm_proc);
// Copy_plane(temp,rhs,dimension,x,sx,cbmask);
// Bug found; cbmask may differ between sx plane and rx plane.
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
// for(int i=0;i<MIN(words,8);i++){
// float *ptr = (float *)&send_buf[i];
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
// }
// Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask^0x3);
// for(int i=0;i<MIN(words,8);i++){
// float *ptr = (float *)&send_buf[i];
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
// }
// recv_buf=send_buf;
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
// printf("bytes %d node %d sending to %d receiving from %d\n",bytes,rank,xmit_to_rank,recv_from_rank );
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}
}
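For each destination slice x, Cshift_comms asks whether the source slice x+sshift still lives on this node: if it spills past rd it sits `(x+sshift)/rd` processors away and its slice index there is the remainder. A standalone sketch of that classification with hypothetical rd and sshift:

```cpp
#include <cstdio>

int main() {
  int rd = 4, sshift = 3;  // hypothetical reduced local dim and shift
  for (int x = 0; x < rd; x++) {
    int offnode   = (x + sshift >= rd);  // source slice on another node?
    int sx        = (x + sshift) % rd;   // its slice index on that node
    int comm_proc = (x + sshift) / rd;   // how many processors away
    printf("x %d : %s, source slice %d, %d hops\n",
           x, offnode ? "off-node" : "local", sx, comm_proc);
  }
  return 0;
}
```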
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
const int Nsimd = vector_type::Nsimd();
SimdGrid *grid=rhs._grid;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=0;
for(int d=0;d<dimension;d++){
if (grid->_simd_layout[d]>1 ) permute_type++;
}
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
// printf("cshift-comms-simd: shift = %d ; sshift = %d ; cbmask %d ; simd_layout %d\n",shift,sshift,cbmask,simd_layout);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
// Strategy
//
//* Loop over source planes
//* if any communication needed extract and send
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
// does the shift to "neighbour" take us off node?
// coordinates (reduce plane, simd_lane) of neighbour?
// how many nodes away is this shift?
// where we should send to?
// where we should receive from?
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
// printf("rd %d x %d shifted %d s=%d comm_any %d\n",rd, x,shifted_x,s,comm_any);
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
// Need Convenience function in _grid. Move this in
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
// for(int i=0;i<Nsimd;i++){
// printf("extracted %d %le\n",i,real(send_buf_extract[i][0]));
// }
for(int i=0;i<Nsimd;i++){
int s = grid->iCoordFromIsite(i,dimension);
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
// printf("Cshift_simd comms %d %le %le\n",i,real(recv_buf_extract[i][0]),real(send_buf_extract[i][0]));
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
// printf("Cshift_simd local %d %le \n",i,real(send_buf_extract[i][0]));
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
for(int i=0;i<vobj::vector_type::Nsimd();i++){
if ( permute_slice ) {
pointers[i] = rpointers[permute_map[permute_type][i]];
} else {
pointers[i] = rpointers[i];
}
// printf("Cshift_simd perm %d num %d wrap %d swiz %d %le unswiz %le\n",permute_slice,num,wrap,i,real(pointers[i][0]),real(rpointers[i][0]));
}
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
#endif
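The final merge in Cshift_comms_simd permutes by swizzling which lane's receive buffer each pointer refers to rather than moving data: lane i reads from lane permute_map[permute_type][i]. A scalar sketch of that idea with a made-up two-lane swap map (permute_map itself lives elsewhere in Grid and is not shown in this diff):

```cpp
#include <cstdio>

int main() {
  // Two "lanes" of received data and a hypothetical swap map.
  float lane0[2] = {0.f, 1.f}, lane1[2] = {10.f, 11.f};
  float *rpointers[2] = {lane0, lane1};
  int swap_map[2] = {1, 0};        // made-up permute_map row: swap lanes

  float *pointers[2];
  int permute_slice = 1;           // pretend this slice needs permuting
  for (int i = 0; i < 2; i++)
    pointers[i] = permute_slice ? rpointers[swap_map[i]] : rpointers[i];

  // "merge": lane i of the result reads from pointers[i].
  for (int i = 0; i < 2; i++)
    printf("result lane %d first element: %g\n", i, pointers[i][0]);
  return 0;
}
```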

View File

@ -1,313 +1,11 @@
#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_
// Work out whether to permute
// ABCDEFGH -> AE BF CG DH permute wrap num
//
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
// Suppose 4way simd in one dim.
// ABCDEFGH -> AECG BFDH permute wrap num
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
// Generalisation to 8 way simd, 16 way simd required.
//
// Need log2 Nway masks. consisting of
// 1 bit 256 bit granule
// 2 bit 128 bit granule
// 4 bits 64 bit granule
// 8 bits 32 bit granules
//
// 15 bits....
// For optimisation:
//
// split into Cshift_none_rb_permute
// split into Cshift_none_rb_simple
//
// split into Cshift_none_permute
// split into Cshift_none_simple
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_ldimensions[dimension];
int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
for(int x=0;x<rd;x++){
int bo = x*rhs._grid->_ostride[dimension];
int o = 0;
if ( permute_dim ) {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in the inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
return ret;
Cshift_local(ret,rhs,dimension,shift);
}
#if 0
// Collapse doesn't appear to work the way I think it should in icpc
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int sx,so,o;
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHABCDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over perp slices.
// Threading considerations:
// Need to map thread_num to
//
// x_min,x_max for Loop-A
// n_min,n_max for Loop-B
// b_min,b_max for Loop-C
// In a way that maximally load balances.
//
// Optimal:
// There are rd*n_block*block items of work.
// These serialise as item "w"
// b=w%block; w=w/block
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
//
// Logic:
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
//
// rd=5 , threads = 8;
// 0 1 2 3 4 5 6 7
// 0 0 0 1 1 1 1 1
for(int x=0;x<rd;x++){ // Loop A
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension];
so =sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
#if 0
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
vComplex *optr = (vComplex *)&ret._odata[o];
vComplex *iptr = (vComplex *)&rhs._odata[so];
for(int b=0;b<num;b++){
permute(optr[b],iptr[b],permute_type);
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#else
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<num;b++){
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
permute(optr[b],iptr[b],permute_type);
}
}
} else {
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
int oo = o+ n*rhs._grid->_slice_stride[dimension];
int soo=so+ n*rhs._grid->_slice_stride[dimension];
ret._odata[oo+i]=rhs._odata[soo+i];
}
}
}
#endif
}
return ret;
}
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHABCDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over all work
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
#pragma omp parallel for
for(int ww=0;ww<work;ww++){
// can optimise this if we know w moves contiguously for a given thread.
// b=(b+1);
// if (b==_slice_block) {b=0; n=n+1;}
// if (n==_slice_nblock) { n=0; x=x+1}
//
// Perhaps a five cycle iterator, or so.
int w=ww;
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
int x = w;
int sx,so,o;
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
vComplex *optr = (vComplex *)&ret._odata[o+b];
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
const char *pf = (const char *)iptr;
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
for(int i=0;i<internal;i++){
permute(optr[i],iptr[i],permute_type);
}
} else {
const char *pf = (const char *) &rhs._odata[so+b];
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
ret._odata[o+b]=rhs._odata[so+b];
}
}
return ret;
}
#endif
#endif
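The experimental `#if 0` path above flattens the three loops into one work index so OpenMP can balance rd*nblock*block items across threads; w decomposes back into (b,n,x) with two mod/div pairs. A standalone round-trip check of that decomposition with hypothetical extents:

```cpp
#include <cstdio>

int main() {
  int rd = 2, nblock = 3, block = 4;   // hypothetical loop extents
  int work = rd * nblock * block;
  for (int ww = 0; ww < work; ww++) {
    int w = ww;
    int b = w % block;  w = w / block;   // innermost index
    int n = w % nblock; w = w / nblock;  // middle index
    int x = w;                           // outermost index
    // Recover the flat index from (x, n, b) and compare.
    int check = (x * nblock + n) * block + b;
    if (check != ww) printf("mismatch at %d\n", ww);
  }
  printf("decomposition round-trips for all %d items\n", work);
  return 0;
}
```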

View File

@ -23,8 +23,20 @@ include_HEADERS = Grid.h\
# Test code
#
bin_PROGRAMS = Grid_main
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_mpi.cc
endif
if BUILD_COMMS_FAKE
extra_sources+=Grid_fake.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_fake.cc
endif
Grid_main_SOURCES = \
Grid_main.cc\
Grid_fake.cc
$(extra_sources)
Grid_main_LDADD = libGrid.a

View File

@ -89,6 +89,9 @@ NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
bin_PROGRAMS = Grid_main$(EXEEXT)
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@ -143,7 +146,12 @@ libGrid_a_LIBADD =
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
PROGRAMS = $(bin_PROGRAMS)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) Grid_fake.$(OBJEXT)
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
Grid_main_DEPENDENCIES = libGrid.a
AM_V_P = $(am__v_P_@AM_V@)
@ -176,7 +184,7 @@ am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
am__v_CXXLD_0 = @echo " CXXLD " $@;
am__v_CXXLD_1 =
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
@ -340,9 +348,10 @@ include_HEADERS = Grid.h\
Grid_Lattice.h\
Grid_config.h
extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
Grid_main_SOURCES = \
Grid_main.cc\
Grid_fake.cc
$(extra_sources)
Grid_main_LDADD = libGrid.a
all: Grid_config.h
@ -490,6 +499,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@
.cc.o:
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<

705
aclocal.m4 vendored

File diff suppressed because it is too large

43
configure vendored
View File

@ -626,6 +626,12 @@ ac_subst_vars='am__EXEEXT_FALSE
am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
BUILD_COMMS_NONE_FALSE
BUILD_COMMS_NONE_TRUE
BUILD_COMMS_FAKE_FALSE
BUILD_COMMS_FAKE_TRUE
BUILD_COMMS_MPI_FALSE
BUILD_COMMS_MPI_TRUE
EGREP
GREP
CPP
@ -5068,6 +5074,31 @@ $as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
;;
esac
if test "X${ac_COMMS}X" == "XmpiX" ; then
BUILD_COMMS_MPI_TRUE=
BUILD_COMMS_MPI_FALSE='#'
else
BUILD_COMMS_MPI_TRUE='#'
BUILD_COMMS_MPI_FALSE=
fi
if test "X${ac_COMMS}X" == "XfakeX" ; then
BUILD_COMMS_FAKE_TRUE=
BUILD_COMMS_FAKE_FALSE='#'
else
BUILD_COMMS_FAKE_TRUE='#'
BUILD_COMMS_FAKE_FALSE=
fi
if test "X${ac_COMMS}X" == "XnoneX" ; then
BUILD_COMMS_NONE_TRUE=
BUILD_COMMS_NONE_FALSE='#'
else
BUILD_COMMS_NONE_TRUE='#'
BUILD_COMMS_NONE_FALSE=
fi
ac_config_files="$ac_config_files Makefile"
@ -5208,6 +5239,18 @@ if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0

View File

@ -71,6 +71,10 @@ case ${ac_COMMS} in
;;
esac
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
AC_CONFIG_FILES(Makefile)
AC_OUTPUT