Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-09 23:45:36 +00:00.
MPI is now working and passing basic tests. Will start to construct a more sensible test suite shortly, since the testing requirements now go beyond what a single Grid_main.cc can do. This will need a more organised source tree and a substantial reorganisation of the build system.
This commit is contained in:
parent 1a9d4d3655
commit 9ba89e64dc

Grid.h (2 changed lines)
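The headline change is that Grid_init now takes argc/argv so that MPI_Init can be called inside it when GRID_COMMS_MPI is defined. A minimal sketch of how a test program is expected to drive the new entry point, assuming the test includes Grid.h and that Grid.h pulls in the MPI-aware headers as the diff below suggests:

#include <Grid.h>

using namespace dpo;

int main(int argc, char **argv)
{
  // Forward argc/argv so Grid_init can call MPI_Init under GRID_COMMS_MPI.
  Grid_init(&argc, &argv);

  // ... set up latt_size / simd_layout / mpi_layout and run the shift tests ...

  MPI_Finalize();  // Grid_main.cc still finalises MPI explicitly at the end.
  return 0;
}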
@@ -52,7 +52,7 @@
namespace dpo {

void Grid_init(void);
void Grid_init(int *argc,char ***argv);
double usecond(void);
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);

@@ -41,8 +41,8 @@ public:
std::vector<int> _simd_layout; // Which dimensions get relayed out over simd lanes.

std::vector<int> _fdimensions;// Global dimensions of array with cb removed
std::vector<int> _gdimensions;// Global dimensions of array
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed

@@ -88,11 +88,20 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
return idx;
}
inline int iCoordFromIsite(int lane,int mu)
{
std::vector<int> coor(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = lane % _simd_layout[d];
lane = lane / _simd_layout[d];
}
return coor[mu];
}

inline int oSites(void) { return _osites; };
inline int iSites(void) { return _isites; };

inline int CheckerboardFromOsite (int Osite){
inline int CheckerBoardFromOsite (int Osite){
std::vector<int> ocoor;
CoordFromOsite(ocoor,Osite);
int ss=0;

@@ -109,6 +118,7 @@ public:
}
}

virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;

@@ -116,6 +126,9 @@ public:

class GridCartesian: public SimdGrid {
public:
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return 0;
}

@@ -199,6 +212,10 @@ public:
class GridRedBlackCartesian : public SimdGrid
{
public:
virtual int CheckerBoarded(int dim){
if( dim==0) return 1;
else return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return (site[0]+site[1]+site[2]+site[3])&0x1;
}

@@ -215,11 +232,13 @@ public:

// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
int ocb=CheckerboardFromOsite(osite);
int ocb=CheckerBoardFromOsite(osite);

if ( (source_cb+ocb)&1 ) {
printf("Checkerboard shift %d\n",(shift)/2);
return (shift)/2;
} else {
printf("Checkerboard shift %d\n",(shift+1)/2);
return (shift+1)/2;
}
}
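The new iCoordFromIsite helper in the hunk above decodes a SIMD lane index into its coordinate along one dimension with a mixed-radix divide and modulo over _simd_layout. A self-contained sketch of the same decode, using an illustrative (not Grid-mandated) four-lane layout of {1,1,2,2}:

#include <cstdio>
#include <vector>

// Mixed-radix decode of a SIMD lane index, mirroring iCoordFromIsite.
static int lane_coord(int lane, int mu, const std::vector<int> &simd_layout)
{
  std::vector<int> coor(simd_layout.size());
  for (int d = 0; d < (int)simd_layout.size(); d++) {
    coor[d] = lane % simd_layout[d];
    lane    = lane / simd_layout[d];
  }
  return coor[mu];
}

int main(void)
{
  std::vector<int> simd_layout = {1, 1, 2, 2}; // lanes spread over z and t
  for (int lane = 0; lane < 4; lane++)
    printf("lane %d sits at (z,t)=(%d,%d)\n", lane,
           lane_coord(lane, 2, simd_layout),
           lane_coord(lane, 3, simd_layout));
  return 0;
}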
@@ -3,12 +3,16 @@
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
namespace dpo {
class CartesianCommunicator {
public:

// Communicator should know nothing of the physics grid, only processor grid.

int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate

@@ -20,7 +24,9 @@ class CartesianCommunicator {

CartesianCommunicator(std::vector<int> &pdimensions_in);

void ShiftedRanks(int dim,int shift,int & source, int & dest);
int Rank(std::vector<int> coor);
int MyRank(void);
void GlobalSumF(float &);
void GlobalSumFVector(float *,int N);

@@ -28,9 +34,9 @@ class CartesianCommunicator {
void GlobalSumFVector(double *,int N);

void SendToRecvFrom(void *xmit,
std::vector<int> to_coordinate,
int xmit_to_rank,
void *recv,
std::vector<int> from_coordinate,
int recv_from_rank,
int bytes);

};
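The communicator deliberately knows nothing about the physics grid, only the processor grid: a shift along a lattice dimension is turned into a pair of ranks via ShiftedRanks, and the halo payload moves with SendToRecvFrom. A hypothetical fragment against the interface declared above (it assumes the Grid headers and an MPI build; the dimension, shift and buffer arguments are made up for illustration, and the call pattern mirrors the one used later in Cshift_comms):

void halo_step(dpo::CartesianCommunicator &comm,
               void *send_buf, void *recv_buf, int bytes)
{
  int xmit_to_rank, recv_from_rank;
  // One hop in processor dimension 0, as Cshift_comms does per slice.
  comm.ShiftedRanks(0, 1, xmit_to_rank, recv_from_rank);

  comm.SendToRecvFrom(send_buf, xmit_to_rank, recv_buf, recv_from_rank, bytes);

  float norm = 1.0f;
  comm.GlobalSumF(norm);  // implemented with MPI_Allreduce in Grid_mpi.cc
}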
@@ -23,7 +23,8 @@ public:
SimdGrid *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;

typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
public:

@@ -36,6 +37,8 @@ public:
}

#include <Grid_cshift_common.h>

#ifdef GRID_COMMS_NONE
#include <Grid_none_cshift.h>
#endif
@@ -11,10 +11,10 @@
/* #undef AVX512 */

/* GRID_COMMS_FAKE */
#define GRID_COMMS_FAKE 1
/* #undef GRID_COMMS_FAKE */

/* GRID_COMMS_MPI */
/* #undef GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1

/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */
@@ -9,6 +9,15 @@
/* AVX512 */
#undef AVX512

/* GRID_COMMS_FAKE */
#undef GRID_COMMS_FAKE

/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI

/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE

/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY

@@ -60,6 +69,9 @@
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME

/* Define to the home page for this package. */
#undef PACKAGE_URL

/* Define to the version of this package. */
#undef PACKAGE_VERSION
Grid_cshift_common.h (new file, 384 lines)
@@ -0,0 +1,384 @@
|
||||
#ifndef _GRID_CSHIFT_COMMON_H_
|
||||
#define _GRID_CSHIFT_COMMON_H_
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
// printf("Gather plane _simple mask %d\n",cbmask);
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
buffer[bo++]=rhs._odata[so+o+b];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// int jjj=0;
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[bo]=rhs._odata[so+o+b];
|
||||
// float * ptr = (float *)& rhs._odata[so+o+b];
|
||||
// if( (cbmask!=3)&&(jjj<8)){
|
||||
// printf("Gather_plane_simple %d %le bo %d\n",so+o+b,*ptr,bo);
|
||||
// jjj++;
|
||||
// }
|
||||
bo++;
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there *is* need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
|
||||
if ( ocb & cbmask ) {
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there *is* need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
merge(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
merge(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// local to node block strided copies
|
||||
//////////////////////////////////////////////////////
|
||||
// if lhs is odd, rhs even??
|
||||
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
// int jjj=0;
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||
// float *ptr =(float *) &rhs._odata[ro+o+b];
|
||||
// if((cbmask!=0x3)&&jjj<8) {
|
||||
// printf("Copy_plane %d %le n,b=%d,%d mask %d ocb %d\n",ro+o+b,*ptr,n,b,cbmask,ocb);
|
||||
// jjj++;
|
||||
// }
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*rhs._grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Local to node Cshift
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
// Work out whether to permute
|
||||
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||
//
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1
|
||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||
|
||||
// Suppose 4way simd in one dim.
|
||||
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||
|
||||
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||
|
||||
// Generalisation to 8 way simd, 16 way simd required.
|
||||
//
|
||||
// Need log2 Nway masks. consisting of
|
||||
// 1 bit 256 bit granule
|
||||
// 2 bit 128 bit granule
|
||||
// 4 bits 64 bit granule
|
||||
// 8 bits 32 bit granules
|
||||
//
|
||||
// 15 bits....
|
||||
|
||||
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
|
||||
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
|
||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Cshift_local(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_ldimensions[dimension];
|
||||
int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * rhs._grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
|
||||
|
||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
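The wrap/num tables in Grid_cshift_common.h above encode when a shifted slice must permute its SIMD lanes: with rd reduced slices per lane, wrap = shift/rd records whether the pull crossed the lane seam, and the top num = shift%rd slices see the opposite answer. A small self-contained sketch that reproduces the permute column of the two-lane table (rd = 4, raw shift only, ignoring the checkerboard-adjusted sshift used in the real code):

#include <cstdio>

int main(void)
{
  const int rd = 4;                  // 8 local sites split over 2 SIMD lanes
  for (int shift = 0; shift < 8; shift++) {
    int wrap = shift / rd;           // crossed the SIMD seam?
    int num  = shift % rd;           // number of top slices that differ
    printf("Shift %d permute:", shift);
    for (int x = 0; x < rd; x++) {
      int permute_slice = (x < rd - num) ? wrap : 1 - wrap;
      printf(" %d", permute_slice);  // matches the table's permute flags
    }
    printf("\n");
  }
  return 0;
}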
@@ -16,8 +16,11 @@
#undef __X86_64
namespace dpo {

void Grid_init(void)
void Grid_init(int *argc,char ***argv)
{
#ifdef GRID_COMMS_MPI
MPI_Init(argc,argv);
#endif
Grid_debug_handler_init();
}
double usecond(void) {
Grid_main.cc (46 changed lines)

@@ -13,34 +13,28 @@ using namespace dpo::QCD;

int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);

#if 0
struct sigaction sa,osa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= sa_action;
sa.sa_flags = SA_ONSTACK|SA_SIGINFO;

sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
#endif

std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);

std::vector<int> mpi_layout(4);
for(int d=0;d<4;d++) mpi_layout[d]=1;
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);

std::vector<int> mpi_layout(4);
mpi_layout[0]=4;
mpi_layout[1]=2;
mpi_layout[2]=4;
mpi_layout[3]=2;

#ifdef AVX512
for(int omp=128;omp<236;omp+=16){
#else
for(int omp=1;omp<8;omp*=2){
for(int omp=1;omp<8;omp*=20){
#endif

#ifdef OMP
omp_set_num_threads(omp);
#endif

for(int lat=4;lat<=16;lat+=40){
for(int lat=16;lat<=16;lat+=40){
latt_size[0] = lat;
latt_size[1] = lat;
latt_size[2] = lat;

@@ -234,9 +228,9 @@ int main (int argc, char ** argv)
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
Shifted = Cshift(Foo,dir,shift); // Shift everything

std::cout << "Shifting even parities"<<std::endl;
std::cout << "Shifting even source parities to odd result"<<std::endl;
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
std::cout << "Shifting odd parities"<<std::endl;
std::cout << "Shifting odd parities to even result"<<std::endl;
rShifted = Cshift(bFoo,dir,shift); // Shift black->red

ShiftedCheck=zero;

@@ -245,19 +239,19 @@ int main (int argc, char ** argv)

// Check results
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){

std::complex<dpo::Real> diff;

std::vector<int> shiftcoor = coor;
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%(latt_size[dir]/mpi_layout[dir]);

std::vector<int> rl(4);
for(int dd=0;dd<4;dd++){
rl[dd] = latt_size[dd]/simd_layout[dd];
rl[dd] = latt_size[dd]/simd_layout[dd]/mpi_layout[dd];
}
int lex = coor[0]%rl[0]
+ (coor[1]%rl[1])*rl[0]

@@ -344,4 +338,6 @@ int main (int argc, char ** argv)
} // loop for lat
} // loop for omp

MPI_Finalize();

}
Grid_mpi.cc (46 changed lines)

@@ -7,16 +7,23 @@ namespace dpo {

CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = _processors.size();
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);

_Nprocessors=1;
_processors = processors;
_processor_coords.resize(_ndimension);

MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
_processor_coor.resize(_ndimension);

MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);

MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
printf("Hello world from processor [");
for(int i=0;i<_ndimension;i++){
printf("%d ",_processor_coor[i]);
_Nprocessors*=_processors[i];
}
printf("]\n");
fflush(stdout);
}

void CartesianCommunicator::GlobalSumF(float &f){

@@ -35,11 +42,9 @@ void CartesianCommunicator::GlobalSumFVector(double *d,int N)
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}

int CartesianCommunicator::ShiftedRank(int dim,int shift)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int rank;
MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
return rank;
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
}
int CartesianCommunicator::Rank(std::vector<int> coor)
{

@@ -50,23 +55,18 @@ int CartesianCommunicator::Rank(std::vector<int> coor)

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
std::vector<int> &to_coordinate,
int dest,
void *recv,
std::vector<int> &from_coordinate,
int from,
int bytes)
{
int dest = Rank(to_coordinate);
int from = Rank(from_coordinate);
MPI_Request reqs[2];
MPI_Status OkeyDokey[2];
int rank = _processor;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
MPI_Waitall(2,reqs,OkeyDokey);

MPI_Request x_req;
MPI_Request r_req;
MPI_Status OkeyDokey;

MPI_Isend(xmit, bytes, MPI_CHAR,dest,dest,communicator,&x_req);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
MPI_Wait(&x_req,&OkeyDokey);
MPI_Wait(&r_req,&OkeyDokey);
MPI_Barrier();
}

}
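SendToRecvFrom is the basic halo primitive: post a nonblocking send and a nonblocking receive, wait on both requests, then barrier. A standalone sketch of that Isend/Irecv/Wait pattern on a simple ring of ranks (the neighbour choice, tag convention and 1 kB payload are illustrative only, not Grid's):

#include <mpi.h>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int dest = (rank + 1) % size;          // neighbour we send to
  int from = (rank - 1 + size) % size;   // neighbour we receive from

  std::vector<char> xmit(1024, 'x'), recv(1024);

  MPI_Request x_req, r_req;
  MPI_Status  status;
  // Tag each message with the receiving rank so the send and receive match.
  MPI_Isend(&xmit[0], (int)xmit.size(), MPI_CHAR, dest, dest, MPI_COMM_WORLD, &x_req);
  MPI_Irecv(&recv[0], (int)recv.size(), MPI_CHAR, from, rank, MPI_COMM_WORLD, &r_req);
  MPI_Wait(&x_req, &status);
  MPI_Wait(&r_req, &status);

  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}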
@ -1,26 +1,9 @@
|
||||
#ifndef _GRID_MPI_CSHIFT_H_
|
||||
#define _GRID_MPI_CSHIFT_H_
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Q. Split this into separate sub functions?
|
||||
//////////////////////////////////////////////
|
||||
|
||||
// CshiftCB_comms_splice
|
||||
// CshiftCB_comms
|
||||
// CshiftCB_local
|
||||
// CshiftCB_local_permute
|
||||
|
||||
// Cshift_comms_splice
|
||||
// Cshift_comms
|
||||
// Cshift_local
|
||||
// Cshift_local_permute
|
||||
|
||||
// Broadly I remain annoyed that the iteration is so painful
|
||||
// for red black data layout, when simple block strided descriptors suffice for non-cb.
|
||||
//
|
||||
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
|
||||
//
|
||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||
// additional stencil support.
|
||||
@ -33,79 +16,31 @@
|
||||
//
|
||||
// Grid could create a neighbour index table for a given stencil.
|
||||
// Could also implement CovariantCshift.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
//Non checkerboarded support functions
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Q. Further split this into separate sub functions?
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)
|
||||
{
|
||||
const int Nsimd = vector_type::Nsimd();
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
// CshiftCB_local
|
||||
// CshiftCB_local_permute
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
|
||||
} else {
|
||||
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
//Checkerboarded support functions
|
||||
//////////////////////////////////////////////////////
|
||||
|
||||
//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
|
||||
//
|
||||
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
|
||||
// Cshift_comms_splice
|
||||
// Cshift_comms
|
||||
// Cshift_local
|
||||
// Cshift_local_permute
|
||||
|
||||
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
const int Nsimd = vector_type::Nsimd();
|
||||
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
//int ld = rhs._grid->_ldimensions[dimension];
|
||||
//int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
@ -115,270 +50,288 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
// the permute type
|
||||
int simd_layout = rhs._grid->_simd_layout[dimension];
|
||||
int comm_dim = rhs._grid->_processors[dimension] >1 ;
|
||||
int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
|
||||
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
// Logic for non-distributed dimension
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_to (simd_layout);
|
||||
std::vector<int> comm_from (simd_layout);
|
||||
std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane
|
||||
std::vector<int> comm_simd_lane(simd_layout);// simd lane of neigbour plane
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Move via a fake comms buffer
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
|
||||
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<scalar_type *> pointers(Nsimd);
|
||||
|
||||
|
||||
|
||||
if ( permute_dim ) {
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
int o = 0; // relative offset to base
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
|
||||
} else {
|
||||
ret._odata[ro+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
if ( !comm_dim ) {
|
||||
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
||||
} else if ( splice_dim ) {
|
||||
|
||||
if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
|
||||
if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
|
||||
|
||||
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
|
||||
}
|
||||
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Extract one orthogonal slice at a time
|
||||
///////////////////////////////////////////
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
|
||||
o = 0; // relative offset to base
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// base offset for source
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
extract(rhs._odata[so+o+b],pointers);
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
|
||||
|
||||
// shift to "neighbour" takes us off node
|
||||
// coordinates (rx, simd_lane) of neighbour
|
||||
// how many nodes away is this shift
|
||||
// where we should send to
|
||||
// where we should receive from
|
||||
int shifted_x = x+s*rd+shift;
|
||||
comm_offnode[s] = shifted_x > ld;
|
||||
comm_send_rx[s] = shifted_x%rd; // which slice to get on the other node
|
||||
comm_send_simd_lane [s] = shifted_x/rd; // which simd lane on the other node
|
||||
comm_from[s] = shifted_x/ld;
|
||||
comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
|
||||
comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];
|
||||
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Insert communication phase
|
||||
////////////////////////////////////////////////
|
||||
#if 0
|
||||
} else if (comm_dim ) {
|
||||
|
||||
// Packed gather sequence is clean
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];
|
||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||
|
||||
// off node; communicate slice (ld==rd)
|
||||
if ( x+shift > rd ) {
|
||||
int sb=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
send_buf[sb++]=rhs._odata[so+i];
|
||||
}
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
// Make a comm_fake that mimics comms in the periodic case.
|
||||
|
||||
// scatter face
|
||||
int rb=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[so+i]=recv_buf[rb++];
|
||||
}
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
|
||||
ret._odata[o+i]=rhs._odata[so+i];
|
||||
}
|
||||
o+=rhs._grid->_slice_stride[dimension];
|
||||
so+=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Pull receive buffers and permuted buffers in
|
||||
////////////////////////////////////////////////
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
|
||||
}
|
||||
|
||||
o = 0; // relative offset to base
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// base offset for source
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
merge(ret._odata[ro+o+b],pointers);
|
||||
}
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else if ( comm_dim ) {
|
||||
|
||||
|
||||
int co; // comm offset
|
||||
int o;
|
||||
|
||||
co=0;
|
||||
for(int x=0;x<rd;x++){
|
||||
o=0;
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
comm_buf[co++]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
// Step through a copy into a comms buffer and pull back in.
|
||||
// A genuine fake implementation could calculate whether the shift loops back
|
||||
co=0; o=0;
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
ret._odata[ro+o+b]=comm_buf[co++];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
} else { // Local dimension, no permute required
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
int o=0;
|
||||
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift);
|
||||
} else {
|
||||
Cshift_comms(ret,rhs,dimension,shift);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
|
||||
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
|
||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
// printf("Cshift_comms : single pass\n");
|
||||
Cshift_comms(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
// printf("Cshift_comms : two pass\n");
|
||||
// printf("call1\n");
|
||||
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
// printf("call2\n");
|
||||
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
// printf("done\n");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
int sshift[2];
|
||||
|
||||
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
|
||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
// printf("call1 0x1 cb=even\n");
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
// printf("call2 0x2 cb=odd\n");
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
// printf("done\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
SimdGrid *grid=rhs._grid;
|
||||
Lattice<vobj> temp(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int simd_layout = rhs._grid->_simd_layout[dimension];
|
||||
int comm_dim = rhs._grid->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
// Packed gather sequence is clean
|
||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||
|
||||
// This code could be simplified by multiple calls to single routine with extra params to
|
||||
// encapsulate the difference in the code paths.
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
if (!offnode) {
|
||||
// printf("local x %d sshift %d offnode %d rd %d cb %d\n",x,sshift,offnode,rd,cb);
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
} else {
|
||||
|
||||
int words = send_buf.size();
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
// printf("nonlocal x %d sx %d sshift %d offnode %d rd %d cb %d cbmask %d rhscb %d comm_proc %d\n",
|
||||
// x,sx,sshift,offnode,rd,cb,cbmask,rhs.checkerboard,comm_proc);
|
||||
// Copy_plane(temp,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
// Bug found; cbmask may differ between the sx plane and the rx plane.
|
||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||
// for(int i=0;i<MIN(words,8);i++){
|
||||
// float *ptr = (float *)&send_buf[i];
|
||||
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
|
||||
// }
|
||||
// Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask^0x3);
|
||||
// for(int i=0;i<MIN(words,8);i++){
|
||||
// float *ptr = (float *)&send_buf[i];
|
||||
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
|
||||
// }
|
||||
// recv_buf=send_buf;
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
// printf("bytes %d node %d sending to %d receiving from %d\n",bytes,rank,xmit_to_rank,recv_from_rank );
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
const int Nsimd = vector_type::Nsimd();
|
||||
SimdGrid *grid=rhs._grid;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
int bytes = buffer_size*words*sizeof(scalar_type);
|
||||
|
||||
std::vector<scalar_type *> pointers(Nsimd); //
|
||||
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
int cb = (cbmask==0x2)? 1 : 0;
|
||||
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
// printf("cshift-comms-simd: shift = %d ; sshift = %d ; cbmask %d ; simd_layout %d\n",shift,sshift,cbmask,simd_layout);
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||
|
||||
// Strategy
|
||||
//
|
||||
//* Loop over source planes
|
||||
//* if any communication needed extract and send
|
||||
//* if communication needed extract and send
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int comm_any = 0;
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
// does the shift to "neighbour" take us off node?
|
||||
// coordinates (reduce plane, simd_lane) of neighbour?
|
||||
// how many nodes away is this shift?
|
||||
// where we should send to?
|
||||
// where we should receive from?
|
||||
int shifted_x = x+s*rd+sshift;
|
||||
comm_offnode[s] = shifted_x >= ld;
|
||||
comm_any = comm_any | comm_offnode[s];
|
||||
comm_proc[s] = shifted_x/ld;
|
||||
// printf("rd %d x %d shifted %d s=%d comm_any %d\n",rd, x,shifted_x,s,comm_any);
|
||||
}
|
||||
|
||||
int o = 0;
|
||||
int bo = x*grid->_ostride[dimension];
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
// Need a convenience function in _grid; move this in.
|
||||
if ( comm_any ) {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
}
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
// for(int i=0;i<Nsimd;i++){
|
||||
// printf("extracted %d %le\n",i,real(send_buf_extract[i][0]));
|
||||
// }
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int s = grid->iCoordFromIsite(i,dimension);
|
||||
|
||||
if(comm_offnode[s]){
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf_extract[i][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
// printf("Cshift_simd comms %d %le %le\n",i,real(recv_buf_extract[i][0]),real(send_buf_extract[i][0]));
|
||||
|
||||
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
||||
|
||||
} else {
|
||||
|
||||
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
// printf("Cshift_simd local %d %le \n",i,real(send_buf_extract[i][0]));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Permute by swizzling pointers in merge
|
||||
int permute_slice=0;
|
||||
int lshift=sshift%ld;
|
||||
int wrap =lshift/rd;
|
||||
int num =lshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
for(int i=0;i<vobj::vector_type::Nsimd();i++){
|
||||
if ( permute_slice ) {
|
||||
pointers[i] = rpointers[permute_map[permute_type][i]];
|
||||
} else {
|
||||
pointers[i] = rpointers[i];
|
||||
}
|
||||
// printf("Cshift_simd perm %d num %d wrap %d swiz %d %le unswiz %le\n",permute_slice,num,wrap,i,real(pointers[i][0]),real(rpointers[i][0]));
|
||||
}
|
||||
|
||||
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
|
||||
|
||||
} else {
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -1,313 +1,11 @@
|
||||
#ifndef _GRID_NONE_CSHIFT_H_
|
||||
#define _GRID_NONE_CSHIFT_H_
|
||||
// Work out whether to permute
|
||||
// ABCDEFGH -> AE BF CG DH permute wrap num
|
||||
//
|
||||
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
|
||||
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
|
||||
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
|
||||
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
|
||||
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
|
||||
// Shift 5 BF CG DH AE 1 1 1 0 FGHACBDE 1 1
|
||||
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
|
||||
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
|
||||
|
||||
// Suppose 4way simd in one dim.
|
||||
// ABCDEFGH -> AECG BFDH permute wrap num
|
||||
|
||||
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
|
||||
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
|
||||
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
|
||||
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
|
||||
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
|
||||
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
|
||||
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
|
||||
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
|
||||
|
||||
// Generalisation to 8 way simd, 16 way simd required.
|
||||
//
|
||||
// Need log2 Nway masks. consisting of
|
||||
// 1 bit 256 bit granule
|
||||
// 2 bit 128 bit granule
|
||||
// 4 bits 64 bit granule
|
||||
// 8 bits 32 bit granules
|
||||
//
|
||||
// 15 bits....
|
||||
|
||||
// For optimisation:
|
||||
//
|
||||
// split into Cshift_none_rb_permute
|
||||
// split into Cshift_none_rb_simple
|
||||
//
|
||||
// split into Cshift_none_permute
|
||||
// split into Cshift_none_simple
|
||||
|
||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
Lattice<vobj> ret(rhs._grid);
|
||||
|
||||
int fd = rhs._grid->_fdimensions[dimension];
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
int ld = rhs._grid->_ldimensions[dimension];
|
||||
int gd = rhs._grid->_gdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
|
||||
|
||||
// the permute type
|
||||
|
||||
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
|
||||
int permute_type=0;
|
||||
for(int d=0;d<dimension;d++){
|
||||
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
|
||||
}
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int bo = x*rhs._grid->_ostride[dimension];
|
||||
int o = 0;
|
||||
|
||||
if ( permute_dim ) {
|
||||
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) {
|
||||
permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
|
||||
} else {
|
||||
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||
}
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
|
||||
}
|
||||
} else {
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
// This call in inner loop is annoying but necessary for dimension=0
|
||||
// in the case of RedBlack grids. Could optimise away with
|
||||
// alternate code paths for all other cases.
|
||||
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int so = sx*rhs._grid->_ostride[dimension];
|
||||
ret._odata[bo+o+b]=rhs._odata[so+o+b];
|
||||
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
Cshift_local(ret,rhs,dimension,shift);
|
||||
}
|
||||
|
||||
#if 0
        // Collapse doesn't appear to work the way I think it should in icpc
        friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
        {
            Lattice<vobj> ret(rhs._grid);

            ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
            shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
            int sx,so,o;
            int rd = rhs._grid->_rdimensions[dimension];
            int ld = rhs._grid->_dimensions[dimension];

            // Map to an always-positive shift.
            shift = (shift+ld)%ld;

            // Work out whether to permute, and the permute type.
            // ABCDEFGH -> AE BF CG DH       permute
            // Shift 0   AE BF CG DH   0 0 0 0   ABCDEFGH
            // Shift 1   DH AE BF CG   1 0 0 0   HABCDEFG
            // Shift 2   CG DH AE BF   1 1 0 0   GHABCDEF
            // Shift 3   BF CG DH AE   1 1 1 0   FGHABCDE
            // Shift 4   AE BF CG DH   1 1 1 1   EFGHABCD
            // Shift 5   DH AE BF CG   0 1 1 1   DEFGHABC
            // Shift 6   CG DH AE BF   0 0 1 1   CDEFGHAB
            // Shift 7   BF CG DH AE   0 0 0 1   BCDEFGHA
            int permute_dim  = rhs._grid->_layout[dimension]>1;
            int permute_type = 0;
            for(int d=0;d<dimension;d++)
                if ( rhs._grid->_layout[d]>1 ) permute_type++;

            // Loop over perpendicular slices.
            //
            // Threading considerations: need to map thread_num to
            //   x_min,x_max for Loop-A
            //   n_min,n_max for Loop-B
            //   b_min,b_max for Loop-C
            // in a way that maximally load balances.
            //
            // Optimal:
            //   There are rd*n_block*block items of work.
            //   These serialise as item "w":
            //     b=w%block;  w=w/block;
            //     n=w%nblock; x=w/nblock.   Perhaps 20 cycles?
            //
            // Logic:
            //   x_chunk = (rd+thread)/nthreads simply divides the work across threads.
            //
            //   rd=5 , threads = 8;
            //   thread   0 1 2 3 4 5 6 7
            //   x_chunk  0 0 0 1 1 1 1 1
            for(int x=0;x<rd;x++){          // Loop A
                sx = (x-shift+ld)%rd;
                o  =  x*rhs._grid->_ostride[dimension];
                so = sx*rhs._grid->_ostride[dimension];
                int permute_slice=0;
                if ( permute_dim ) {
                    permute_slice = shift/rd;
                    if ( x<shift%rd ) permute_slice = 1-permute_slice;
                }
#if 0
                if ( permute_slice ) {

                    int internal = sizeof(vobj)/sizeof(vComplex);
                    int num      = rhs._grid->_slice_block[dimension]*internal;

                    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                        vComplex *optr = (vComplex *)&ret._odata[o];
                        vComplex *iptr = (vComplex *)&rhs._odata[so];
                        for(int b=0;b<num;b++){
                            permute(optr[b],iptr[b],permute_type);
                        }
                        o +=rhs._grid->_slice_stride[dimension];
                        so+=rhs._grid->_slice_stride[dimension];
                    }

                } else {

                    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                        for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
                            ret._odata[o+i]=rhs._odata[so+i];
                        }
                        o +=rhs._grid->_slice_stride[dimension];
                        so+=rhs._grid->_slice_stride[dimension];
                    }

                }
#else
                if ( permute_slice ) {

                    int internal = sizeof(vobj)/sizeof(vComplex);
                    int num      = rhs._grid->_slice_block[dimension]*internal;
#pragma omp parallel for collapse(2)
                    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                        for(int b=0;b<num;b++){
                            vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
                            vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
                            permute(optr[b],iptr[b],permute_type);
                        }
                    }

                } else {
#pragma omp parallel for collapse(2)
                    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                        for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
                            int oo = o + n*rhs._grid->_slice_stride[dimension];
                            int soo= so+ n*rhs._grid->_slice_stride[dimension];
                            ret._odata[oo+i]=rhs._odata[soo+i];
                        }
                    }

                }
#endif
            }
            return ret;
        }

        friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
        {
            Lattice<vobj> ret(rhs._grid);

            ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
            shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
            int rd = rhs._grid->_rdimensions[dimension];
            int ld = rhs._grid->_dimensions[dimension];

            // Map to an always-positive shift.
            shift = (shift+ld)%ld;

            // Work out whether to permute, and the permute type.
            // ABCDEFGH -> AE BF CG DH       permute
            // Shift 0   AE BF CG DH   0 0 0 0   ABCDEFGH
            // Shift 1   DH AE BF CG   1 0 0 0   HABCDEFG
            // Shift 2   CG DH AE BF   1 1 0 0   GHABCDEF
            // Shift 3   BF CG DH AE   1 1 1 0   FGHABCDE
            // Shift 4   AE BF CG DH   1 1 1 1   EFGHABCD
            // Shift 5   DH AE BF CG   0 1 1 1   DEFGHABC
            // Shift 6   CG DH AE BF   0 0 1 1   CDEFGHAB
            // Shift 7   BF CG DH AE   0 0 0 1   BCDEFGHA
            int permute_dim  = rhs._grid->_layout[dimension]>1;
            int permute_type = 0;
            for(int d=0;d<dimension;d++)
                if ( rhs._grid->_layout[d]>1 ) permute_type++;

            // Loop over all work as a single flattened index.
            int work = rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];

#pragma omp parallel for
            for(int ww=0;ww<work;ww++){

                // Could optimise this if we know w moves contiguously for a given thread:
                //   b=(b+1);
                //   if (b==_slice_block)  { b=0; n=n+1; }
                //   if (n==_slice_nblock) { n=0; x=x+1; }
                // Perhaps a five cycle iterator, or so.
                int w = ww;
                int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
                int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
                int x = w;

                int sx,so,o;
                sx = (x-shift+ld)%rd;
                o  =  x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common subexpression alert
                so = sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];

                int permute_slice=0;
                if ( permute_dim ) {
                    permute_slice = shift/rd;
                    if ( x<shift%rd ) permute_slice = 1-permute_slice;
                }

                if ( permute_slice ) {

                    int internal = sizeof(vobj)/sizeof(vComplex);
                    vComplex *optr = (vComplex *)&ret._odata[o+b];
                    vComplex *iptr = (vComplex *)&rhs._odata[so+b];
                    const char *pf = (const char *)iptr;
                    for(int i=0;i<sizeof(vobj);i+=64){
                        _mm_prefetch(pf+i,_MM_HINT_T0);
                    }

                    for(int i=0;i<internal;i++){
                        permute(optr[i],iptr[i],permute_type);
                    }

                } else {

                    const char *pf = (const char *) &rhs._odata[so+b];
                    for(int i=0;i<sizeof(vobj);i+=64){
                        _mm_prefetch(pf+i,_MM_HINT_T0);
                    }
                    ret._odata[o+b]=rhs._odata[so+b];

                }
            }
            return ret;
        }
#endif
#endif

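The disabled flattened-work variant above collapses the x/n/b triple loop into a single index ww so that one omp parallel for can balance all rd*nblock*block items of work. A minimal sketch of that decomposition and its inverse, with assumed slice sizes (not taken from the commit):

    // Sketch of the flattened work decomposition used in the disabled Cshift
    // variant: ww runs over rd*nblock*block items and unpacks into (x,n,b).
    // The sizes below are assumed purely for illustration.
    #include <cassert>

    int main(void)
    {
        const int block = 3, nblock = 4, rd = 5;   // assumed slice geometry
        const int work  = rd*nblock*block;

        for(int ww=0;ww<work;ww++){
            int w = ww;
            int b = w%block;  w = w/block;
            int n = w%nblock; w = w/nblock;
            int x = w;

            // Recomposing recovers the serial index, so every (x,n,b) triple
            // is visited exactly once across the flattened loop.
            assert( ww == (x*nblock + n)*block + b );
            assert( b<block && n<nblock && x<rd );
        }
        return 0;
    }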
14  Makefile.am
@ -23,8 +23,20 @@ include_HEADERS = Grid.h\
# Test code
#
bin_PROGRAMS = Grid_main

extra_sources=
if BUILD_COMMS_MPI
  extra_sources+=Grid_mpi.cc
endif
if BUILD_COMMS_FAKE
  extra_sources+=Grid_fake.cc
endif
if BUILD_COMMS_NONE
  extra_sources+=Grid_fake.cc
endif

Grid_main_SOURCES = \
	Grid_main.cc\
	Grid_fake.cc
	$(extra_sources)

Grid_main_LDADD = libGrid.a

16  Makefile.in
@ -89,6 +89,9 @@ NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
bin_PROGRAMS = Grid_main$(EXEEXT)
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@ -143,7 +146,12 @@ libGrid_a_LIBADD =
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
PROGRAMS = $(bin_PROGRAMS)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) Grid_fake.$(OBJEXT)
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
Grid_main_DEPENDENCIES = libGrid.a
AM_V_P = $(am__v_P_@AM_V@)
@ -176,7 +184,7 @@ am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
am__v_CXXLD_0 = @echo "  CXXLD   " $@;
am__v_CXXLD_1 =
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST)
am__can_run_installinfo = \
  case $$AM_UPDATE_INFO_DIR in \
    n|no|NO) false;; \
@ -340,9 +348,10 @@ include_HEADERS = Grid.h\
	Grid_Lattice.h\
	Grid_config.h

extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
Grid_main_SOURCES = \
	Grid_main.cc\
	Grid_fake.cc
	$(extra_sources)

Grid_main_LDADD = libGrid.a
all: Grid_config.h
@ -490,6 +499,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@

.cc.o:
@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<

705  aclocal.m4 (vendored)
File diff suppressed because it is too large.

43  configure (vendored)
@ -626,6 +626,12 @@ ac_subst_vars='am__EXEEXT_FALSE
am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
BUILD_COMMS_NONE_FALSE
BUILD_COMMS_NONE_TRUE
BUILD_COMMS_FAKE_FALSE
BUILD_COMMS_FAKE_TRUE
BUILD_COMMS_MPI_FALSE
BUILD_COMMS_MPI_TRUE
EGREP
GREP
CPP
@ -5068,6 +5074,31 @@ $as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
    ;;
esac

if test "X${ac_COMMS}X" == "XmpiX" ; then
  BUILD_COMMS_MPI_TRUE=
  BUILD_COMMS_MPI_FALSE='#'
else
  BUILD_COMMS_MPI_TRUE='#'
  BUILD_COMMS_MPI_FALSE=
fi

if test "X${ac_COMMS}X" == "XfakeX" ; then
  BUILD_COMMS_FAKE_TRUE=
  BUILD_COMMS_FAKE_FALSE='#'
else
  BUILD_COMMS_FAKE_TRUE='#'
  BUILD_COMMS_FAKE_FALSE=
fi

if test "X${ac_COMMS}X" == "XnoneX" ; then
  BUILD_COMMS_NONE_TRUE=
  BUILD_COMMS_NONE_FALSE='#'
else
  BUILD_COMMS_NONE_TRUE='#'
  BUILD_COMMS_NONE_FALSE=
fi


ac_config_files="$ac_config_files Makefile"

@ -5208,6 +5239,18 @@ if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
  as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
  as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
  as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
  as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi

: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0

configure.ac
@ -71,6 +71,10 @@ case ${ac_COMMS} in
     ;;
esac

AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])


AC_CONFIG_FILES(Makefile)
AC_OUTPUT