From 1eee664092b1d981770532a5917e364a5efebc6a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Apr 2015 20:22:04 +0100 Subject: [PATCH] Stencil code pretty much shaken out. Beginning of inner product and norm2. --- Grid.h | 1 + Grid_Lattice.h | 12 + Grid_config.h | 4 +- Grid_cshift_common.h | 2 + Grid_main.cc | 15 +- Grid_math_types.h | 3 +- Grid_simd.h | 53 +++- Grid_stencil.h | 572 ++++++++++++++++++++--------------------- Grid_stencil_common.cc | 258 +++++++++++++++++++ Grid_vComplexD.h | 7 +- Grid_vComplexF.h | 38 +-- Makefile.am | 33 ++- Makefile.in | 53 ++-- TODO | 2 +- test_Grid_jacobi.cc | 206 +++++++++++++++ test_Grid_stencil.cc | 154 +++++++++++ 16 files changed, 1050 insertions(+), 363 deletions(-) create mode 100644 Grid_stencil_common.cc create mode 100644 test_Grid_jacobi.cc create mode 100644 test_Grid_stencil.cc diff --git a/Grid.h b/Grid.h index a0989e0b..2a1072ea 100644 --- a/Grid.h +++ b/Grid.h @@ -47,6 +47,7 @@ #include #include #include +#include #include namespace Grid { diff --git a/Grid_Lattice.h b/Grid_Lattice.h index 119d6181..1e2c820e 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -407,6 +407,18 @@ public: return ret; } + template + inline RealD norm2(const Lattice &arg){ + typedef typename vobj::scalar_type scalar; + decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero; + + scalar nrm; + for(int ss=0;ssoSites(); ss++){ + vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]); + } + nrm = Reduce(TensorRemove(vnrm)); + return real(nrm); + } } #endif diff --git a/Grid_config.h b/Grid_config.h index deb6c74c..bb885708 100644 --- a/Grid_config.h +++ b/Grid_config.h @@ -11,10 +11,10 @@ /* #undef AVX512 */ /* GRID_COMMS_MPI */ -/* #undef GRID_COMMS_MPI */ +#define GRID_COMMS_MPI 1 /* GRID_COMMS_NONE */ -#define GRID_COMMS_NONE 1 +/* #undef GRID_COMMS_NONE */ /* Define to 1 if you have the `gettimeofday' function. */ #define HAVE_GETTIMEOFDAY 1 diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index 2910151c..e962f6ea 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -8,6 +8,7 @@ friend void Gather_plane_simple (Lattice &rhs,std::vector_rdimensions[dimension]; + printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout); if ( !rhs._grid->CheckerBoarded(dimension) ) { int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane @@ -18,6 +19,7 @@ friend void Gather_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ + printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout); buffer[bo++]=rhs._odata[so+o+b]; } o +=rhs._grid->_slice_stride[dimension]; diff --git a/Grid_main.cc b/Grid_main.cc index 8cf0dae3..13c3d9ee 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -1,10 +1,4 @@ #include "Grid.h" -#include "Grid_vRealD.h" -#include "Grid_vRealF.h" -#include "Grid_vComplexD.h" -#include "Grid_vComplexF.h" -#include "Grid_Cartesian.h" -#include "Grid_Lattice.h" using namespace std; using namespace Grid; @@ -16,13 +10,14 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector latt_size(4); + std::vector simd_layout(4); std::vector mpi_layout(4); - mpi_layout[0]=1; - mpi_layout[1]=1; - mpi_layout[2]=1; - mpi_layout[3]=1; + mpi_layout[0]=2; + mpi_layout[1]=2; + mpi_layout[2]=2; + mpi_layout[3]=2; #ifdef AVX512 for(int omp=128;omp<236;omp+=16){ diff --git a/Grid_math_types.h b/Grid_math_types.h index a2a3f97c..e3432645 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -922,9 +922,10 @@ template inline iMatrix operator - (Integer lhs,const iMatri { typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; iScalar ret=zero; + iScalar tmp; for(int c1=0;c1 ComplexF; typedef std::complex ComplexD; - typedef std::complex Complex; + inline RealF adj(const RealF & r){ return r; } inline RealF conj(const RealF & r){ return r; } @@ -54,7 +53,7 @@ namespace Grid { inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } - inline Complex adj(const Complex& r ){ return(conj(r)); } + inline ComplexF adj(const ComplexF& r ){ return(conj(r)); } //conj already supported for complex inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);} @@ -232,4 +231,52 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){ #include #include +namespace Grid { + + // NB: Template the following on "type Complex" and then implement *,+,- for + // ComplexF, ComplexD, RealF, RealD above to + // get full generality of binops with scalars. + inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); } + inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } + + inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); } + inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); } + + inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); } + inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); } + + inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); } + inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); }; + inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); } + inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); } + inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); } + + // Default precision + typedef RealD Real; + typedef std::complex Complex; + + typedef vRealD vReal; + typedef vComplexD vComplex; +} #endif diff --git a/Grid_stencil.h b/Grid_stencil.h index e10cabaa..d8debac0 100644 --- a/Grid_stencil.h +++ b/Grid_stencil.h @@ -1,11 +1,15 @@ #ifndef GRID_STENCIL_H #define GRID_STENCIL_H + ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient -// gather to a point stencil code. CSHIFT is not the best way, so probably need +// gather to a point stencil code. CSHIFT is not the best way, so need // additional stencil support. // -// Stencil based code could pre-exchange haloes and use a table lookup for neighbours +// Stencil based code will pre-exchange haloes and use a table lookup for neighbours. +// This will be done with generality to allow easier efficient implementations. +// Overlap of comms and compute could be semi-automated by tabulating off-node connected, +// and // // Lattice could also allocate haloes which get used for stencil code. // @@ -35,329 +39,313 @@ namespace Grid { - class Stencil { + struct CommsRequest { + int words; + int unified_buffer_offset; + int tag; + int to_rank; + int from_rank; + } ; + + class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: - Stencil(GridBase *grid, - int npoints, - int checkerboard, - std::vector directions, - std::vector distances); - - void Stencil_local (int dimension,int shift,int cbmask); - void Stencil_comms (int dimension,int shift,int cbmask); - void Stencil_comms_simd(int dimension,int shift,int cbmask); - - // Will need to implement actions for - Copy_plane; - Copy_plane_permute; - Gather_plane; - - // The offsets to all neibours in stencil in each direction int _checkerboard; int _npoints; // Move to template param? GridBase * _grid; - - // Store these as SIMD Integer needed - // - // std::vector< iVector > _offsets; - // std::vector< iVector > _local; - // std::vector< iVector > _comm_buf_size; - // std::vector< iVector > _permute; - - std::vector > _offsets; - std::vector > _local; + + // npoints of these + std::vector _directions; + std::vector _distances; std::vector _comm_buf_size; - std::vector _permute; + std::vector _permute_type; - }; + // npoints x Osites() of these + std::vector > _offsets; + std::vector > _is_local; + std::vector > _permute; - Stencil::Stencil(GridBase *grid, - int npoints, - int checkerboard, - std::vector directions, - std::vector distances){ - - _npoints = npoints; - _grid = grid; - - for(int i=0;i CommsRequests; - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; + CartesianStencil(GridBase *grid, + int npoints, + int checkerboard, + const std::vector &directions, + const std::vector &distances); - _checkerboard = checkerboard; - // the permute type - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + // Add to tables for various cases; is this mistaken. only local if 1 proc in dim + // Can this be avoided with simpler coding of comms? + void Local (int point, int dimension,int shift,int cbmask); + void Comms (int point, int dimension,int shift,int cbmask); + void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute); + void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset); - int sshift[2]; + // Could allow a functional munging of the halo to another type during the comms. + // this could implement the 16bit/32bit/64bit compression. + template void HaloExchange(Lattice &source, + std::vector > &u_comm_buf) + { + // conformable(source._grid,_grid); + assert(source._grid==_grid); + if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size); + int u_comm_offset=0; - if ( !comm_dim ) { - sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); - sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + // Gather all comms buffers + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; - if ( sshift[0] == sshift[1] ) { - Stencil_local(dimension,shift,0x3); - } else { - Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Stencil_local(dimension,shift,0x2);// both with block stride loop iteration - } - } else if ( splice_dim ) { - sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); - sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + for(int point = 0 ; point < _npoints; point++) { + + printf("Point %d \n",point);fflush(stdout); + int dimension = _directions[point]; + int displacement = _distances[point]; - if ( sshift[0] == sshift[1] ) { - Stencil_comms_simd(dimension,shift,0x3); - } else { - Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration - } - } else { - // Cshift_comms(ret,rhs,dimension,shift); - sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); - sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); - if ( sshift[0] == sshift[1] ) { - Stencil_comms(dimension,shift,0x3); - } else { - Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes - Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + + // Map to always positive shift modulo global full dimension. + int shift = (displacement+fd)%fd; + + int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift); + assert (checkerboard== _checkerboard); + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + // Gather phase + int sshift [2]; + if ( comm_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + if ( sshift[0] == sshift[1] ) { + if (splice_dim) { + printf("splice 0x3 \n");fflush(stdout); + GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset); + } else { + printf("NO splice 0x3 \n");fflush(stdout); + GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset); + } + } else { + if(splice_dim){ + printf("splice 0x1,2 \n");fflush(stdout); + GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes + GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration + } else { + printf("NO splice 0x1,2 \n");fflush(stdout); + GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset); + GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset); + } + } } } } - } - - void Stencil::Stencil_local (int dimension,int shift,int cbmask) - { - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int gd = _grid->_gdimensions[dimension]; - - // Map to always positive shift modulo global full dimension. - shift = (shift+fd)%fd; - - // the permute type - int permute_dim =_grid->PermuteDim(dimension); - int permute_type=_grid->PermuteType(dimension); - - for(int x=0;x void GatherStartComms(Lattice &rhs,int dimension,int shift,int cbmask, + std::vector > &u_comm_buf, + int &u_comm_offset) + { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; - int o = 0; - int bo = x * _grid->_ostride[dimension]; + GridBase *grid=_grid; + assert(rhs._grid==_grid); + // conformable(_grid,rhs._grid); + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + + std::vector > send_buf(buffer_size); // hmm... + std::vector > recv_buf(buffer_size); int cb= (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb); - int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); - int sx = (x+sshift)%rd; - - int permute_slice=0; - if(permute_dim){ - int wrap = sshift/rd; - int num = sshift%rd; - if ( x< rd-num ) permute_slice=wrap; - else permute_slice = 1-wrap; - } - - if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type); - else Copy_plane (dimension,x,sx,cbmask); - - } - } - - void Stencil::Stencil_comms (int dimension,int shift,int cbmask) - { - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - GridBase *grid=_grid; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; - // FIXME: Do something with buffer_size?? - - int cb= (cbmask==0x2)? 1 : 0; - int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); - - for(int x=0;x= rd ); - int sx = (x+sshift)%rd; - int comm_proc = (x+sshift)/rd; - - if (!offnode) { + for(int x=0;x= rd ); + int sx = (x+sshift)%rd; + int comm_proc = (x+sshift)/rd; - } else { - - int words = send_buf.size(); - if (cbmask != 0x3) words=words>>1; - - int bytes = words * sizeof(vobj); - - Gather_plane_simple (dimension,sx,cbmask); - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - /* - grid->SendToRecvFrom((void *)&send_buf[0], - xmit_to_rank, - (void *)&recv_buf[0], - recv_from_rank, - bytes); - */ - Scatter_plane_simple (dimension,x,cbmask); - } - } - } - - void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask) - { - GridBase *grid=_grid; - const int Nsimd = _grid->Nsimd(); - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - int fd = _grid->_fdimensions[dimension]; - int rd = _grid->_rdimensions[dimension]; - int ld = _grid->_ldimensions[dimension]; - int simd_layout = _grid->_simd_layout[dimension]; - int comm_dim = _grid->_processors[dimension] >1 ; - - assert(comm_dim==1); - assert(simd_layout==2); - assert(shift>=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; - // FIXME do something with buffer size - - std::vector pointers(Nsimd); // - std::vector rpointers(Nsimd); // received pointers - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - - int cb = (cbmask==0x2)? 1 : 0; - int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); - - std::vector comm_offnode(simd_layout); - std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension - std::vector icoor(grid->Nd()); - - for(int x=0;x= ld; - comm_any = comm_any | comm_offnode[s]; - comm_proc[s] = shifted_x/ld; - } - - int o = 0; - int bo = x*grid->_ostride[dimension]; - int sx = (x+sshift)%rd; - - if ( comm_any ) { - - for(int i=0;iiCoorFromIindex(icoor,i); - s = icoor[dimension]; + printf("GatherStartComms offnode x %d\n",x);fflush(stdout); + int words = send_buf.size(); + if (cbmask != 0x3) words=words>>1; + + int bytes = words * sizeof(vobj); + + printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout); + Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); + + printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout); + + int rank = _grid->_processor; + int recv_from_rank; + int xmit_to_rank; + _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - if(comm_offnode[s]){ - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); - - /* - grid->SendToRecvFrom((void *)&send_buf_extract[i][0], - xmit_to_rank, - (void *)&recv_buf_extract[i][0], - recv_from_rank, - bytes); - */ - - rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; - - } else { - - rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; + // FIXME Implement asynchronous send & also avoid buffer copy + _grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout); + printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout); + for(int i=0;i>(permute_type+1)); - int PermuteMap; - for(int i=0;i + void GatherStartCommsSimd(Lattice &rhs,int dimension,int shift,int cbmask, + std::vector > &u_comm_buf, + int &u_comm_offset) + { + const int Nsimd = _grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int words = sizeof(vobj)/sizeof(vector_type); + + /* FIXME ALTERNATE BUFFER DETERMINATION */ + std::vector > send_buf_extract(Nsimd,std::vector(buffer_size*words) ); + std::vector > recv_buf_extract(Nsimd,std::vector(buffer_size*words) ); + int bytes = buffer_size*words*sizeof(scalar_type); + + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb); + + std::vector comm_offnode(simd_layout); + std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension + std::vector icoor(_grid->Nd()); + + for(int x=0;x= ld; + comm_any = comm_any | comm_offnode[s]; + comm_proc[s] = shifted_x/ld; + } + + int o = 0; + int bo = x*_grid->_ostride[dimension]; + int sx = (x+sshift)%rd; + + if ( comm_any ) { + + for(int i=0;iiCoorFromIindex(icoor,i); + s = icoor[dimension]; + + if(comm_offnode[s]){ + + int rank = _grid->_processor; + int recv_from_rank; + int xmit_to_rank; + + _grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); + + + _grid->SendToRecvFrom((void *)&send_buf_extract[i][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + + rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + + } else { + + rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; + + } + + } + + // Permute by swizzling pointers in merge + int permute_slice=0; + int lshift=sshift%ld; + int wrap =lshift/rd; + int num =lshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + int toggle_bit = (Nsimd>>(permute_type+1)); + int PermuteMap; + for(int i=0;i &directions, + const std::vector &distances) + : _offsets(npoints), + _is_local(npoints), + _comm_buf_size(npoints), + _permute_type(npoints), + _permute(npoints) + { + _npoints = npoints; + _grid = grid; + _directions = directions; + _distances = distances; + _unified_buffer_size=0; + _request_count =0; + CommsRequests.resize(0); + + int osites = _grid->oSites(); + + for(int i=0;i_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + _permute_type[point]=_grid->PermuteType(dimension); + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int sshift[2]; + + // Underlying approach. For each local site build + // up a table containing the npoint "neighbours" and whether they + // live in lattice or a comms buffer. + if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Local(point,dimension,shift,0x3); + } else { + Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Local(point,dimension,shift,0x2);// both with block stride loop iteration + } + } else { // All permute extract done in comms phase prior to Stencil application + // So tables are the same whether comm_dim or splice_dim + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + if ( sshift[0] == sshift[1] ) { + Comms(point,dimension,shift,0x3); + } else { + Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Comms(point,dimension,shift,0x2);// both with block stride loop iteration + } + } + } + } + + + void CartesianStencil::Local (int point, int dimension,int shift,int cbmask) + { + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. + shift = (shift+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? 1 : 0; + + int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int permute_slice=0; + if(permute_dim){ + int wrap = sshift/rd; + int num = sshift%rd; + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + } + + CopyPlane(point,dimension,x,sx,cbmask,permute_slice); + + } + } + + void CartesianStencil::Comms (int point,int dimension,int shift,int cbmask) + { + GridBase *grid=_grid; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and + // send to one or more remote nodes. + + int cb= (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + for(int x=0;x= rd ); + int sx = (x+sshift)%rd; + int comm_proc = (x+sshift)/rd; + + if (!offnode) { + + int permute_slice=0; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice); + + } else { + + int words = buffer_size; + if (cbmask != 0x3) words=words>>1; + + // GatherPlaneSimple (point,dimension,sx,cbmask); + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + + CommsRequest cr; + + cr.tag = _request_count++; + cr.words = words; + cr.unified_buffer_offset = _unified_buffer_size; + _unified_buffer_size += words; + grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank); + + CommsRequests.push_back(cr); + + ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase + + } + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int o = 0; // relative offset to base within plane + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + _offsets [point][lo+o+b]=ro+o+b; + _is_local[point][lo+o+b]=1; + _permute [point][lo+o+b]=permute; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b); + + if ( ocb&cbmask ) { + _offsets [point][lo+o+b]=ro+o+b; + _is_local[point][lo+o+b]=1; + _permute [point][lo+o+b]=permute; + } + + } + o +=_grid->_slice_stride[dimension]; + } + + } + } + // Routine builds up integer table for each site in _offsets, _is_local, _permute + void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset) + { + int rd = _grid->_rdimensions[dimension]; + + if ( !_grid->CheckerBoarded(dimension) ) { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + // Simple block stride gather of SIMD objects + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + _offsets [point][so+o+b]=offset+(bo++); + _is_local[point][so+o+b]=0; + _permute [point][so+o+b]=0; + } + o +=_grid->_slice_stride[dimension]; + } + + } else { + + int so = plane*_grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + + for(int n=0;n<_grid->_slice_nblock[dimension];n++){ + for(int b=0;b<_grid->_slice_block[dimension];b++){ + + int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + _offsets [point][so+o+b]=offset+(bo++); + _is_local[point][so+o+b]=0; + _permute [point][so+o+b]=0; + } + } + o +=_grid->_slice_stride[dimension]; + } + } + } +} diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h index 3278b15e..f1c335e8 100644 --- a/Grid_vComplexD.h +++ b/Grid_vComplexD.h @@ -194,6 +194,8 @@ namespace Grid { float b= imag(c); vsplat(ret,a,b); } + + friend inline void vsplat(vComplexD &ret,double rl,double ig){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(ig,rl,ig,rl); @@ -321,13 +323,14 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ { return l*r; } - inline vComplex trace(const vComplex &arg){ + inline vComplexD trace(const vComplexD &arg){ return arg; } ///////////////////////////////////////////////////////////////////////// //// Generic routine to promote object -> object //// Supports the array reordering transformation that gives me SIMD utilisation /////////////////////////////////////////////////////////////////////////// +/* template class object> inline object splat(objects){ object ret; @@ -338,6 +341,6 @@ inline object splat(objects){ } return ret; } - +*/ } #endif diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index b6b7ebe9..18cf1749 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -3,6 +3,16 @@ #include "Grid.h" namespace Grid { + + /* + inline void Print(const char *A,cvec c) { + float *fp=(float *)&c; + printf(A); + printf(" %le %le %le %le %le %le %le %le\n", + fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]); + } + */ + class vComplexF { // protected: @@ -108,7 +118,7 @@ namespace Grid { ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi - ret.v= _mm256_addsub_ps(ymm0,ymm1); // FIXME -- AVX2 could MAC + ret.v= _mm256_addsub_ps(ymm0,ymm1); #endif #ifdef SSE2 cvec ymm0,ymm1,ymm2; @@ -142,7 +152,7 @@ namespace Grid { //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// - friend inline void vset(vComplexF &ret, Complex *a){ + friend inline void vset(vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real()); #endif @@ -216,12 +226,12 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ #endif } - friend inline vComplexF operator * (const Complex &a, vComplexF b){ + friend inline vComplexF operator * (const ComplexF &a, vComplexF b){ vComplexF va; vsplat(va,a); return va*b; } - friend inline vComplexF operator * (vComplexF b,const Complex &a){ + friend inline vComplexF operator * (vComplexF b,const ComplexF &a){ return a*b; } @@ -283,22 +293,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ } */ - // NB: Template the following on "type Complex" and then implement *,+,- for - // ComplexF, ComplexD, RealF, RealD above to - // get full generality of binops with scalars. - friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } - friend inline void sub (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); } - friend inline void add (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); } - friend inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const Complex *__restrict__ x){ *y = (*a)*(*x)+(*y); }; - friend inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) * (*r); } - friend inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) - (*r); } - friend inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) + (*r); } - /////////////////////// // Conjugate /////////////////////// + friend inline vComplexF conj(const vComplexF &in){ vComplexF ret ; vzero(ret); #if defined (AVX1)|| defined (AVX2) @@ -363,10 +362,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ }; - inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; } + inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) + { + return conj(l)*r; + } - typedef vComplexF vFComplex; - typedef vComplexF vComplex; inline void zeroit(vComplexF &z){ vzero(z);} inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r) diff --git a/Makefile.am b/Makefile.am index d11f28d5..e306eb56 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,11 +1,22 @@ # additional include paths necessary to compile the C++ library AM_CXXFLAGS = -I$(top_srcdir)/ + +extra_sources= +if BUILD_COMMS_MPI + extra_sources+=Grid_communicator_mpi.cc + extra_sources+=Grid_stencil_common.cc +endif +if BUILD_COMMS_NONE + extra_sources+=Grid_communicator_fake.cc + extra_sources+=Grid_stencil_common.cc +endif + # # Libraries # lib_LIBRARIES = libGrid.a -libGrid_a_SOURCES = Grid_init.cc +libGrid_a_SOURCES = Grid_init.cc $(extra_sources) # # Include files @@ -26,24 +37,18 @@ include_HEADERS = Grid_config.h\ Grid_cshift_common.h\ Grid_cshift_mpi.h\ Grid_cshift_none.h\ + Grid_stencil.h\ Grid_math_types.h # # Test code # -bin_PROGRAMS = Grid_main - -extra_sources= -if BUILD_COMMS_MPI - extra_sources+=Grid_communicator_mpi.cc -endif -if BUILD_COMMS_NONE - extra_sources+=Grid_communicator_fake.cc -endif - -Grid_main_SOURCES = \ - Grid_main.cc\ - $(extra_sources) +bin_PROGRAMS = Grid_main test_Grid_stencil +Grid_main_SOURCES = Grid_main.cc Grid_main_LDADD = libGrid.a + +test_Grid_stencil_SOURCES = test_Grid_Stencil.cc +test_Grid_stencil_LDADD = libGrid.a + diff --git a/Makefile.in b/Makefile.in index 16981e52..11b86688 100644 --- a/Makefile.in +++ b/Makefile.in @@ -88,9 +88,11 @@ POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : -bin_PROGRAMS = Grid_main$(EXEEXT) -@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc -@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc +@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc \ +@BUILD_COMMS_MPI_TRUE@ Grid_stencil_common.cc +@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc \ +@BUILD_COMMS_NONE_TRUE@ Grid_stencil_common.cc +bin_PROGRAMS = Grid_main$(EXEEXT) test_Grid_stencil$(EXEEXT) subdir = . ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac @@ -142,18 +144,23 @@ am__v_AR_0 = @echo " AR " $@; am__v_AR_1 = libGrid_a_AR = $(AR) $(ARFLAGS) libGrid_a_LIBADD = -am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT) +am__libGrid_a_SOURCES_DIST = Grid_init.cc Grid_communicator_mpi.cc \ + Grid_stencil_common.cc Grid_communicator_fake.cc +@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT) \ +@BUILD_COMMS_MPI_TRUE@ Grid_stencil_common.$(OBJEXT) +@BUILD_COMMS_NONE_TRUE@am__objects_2 = \ +@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT) \ +@BUILD_COMMS_NONE_TRUE@ Grid_stencil_common.$(OBJEXT) +am__objects_3 = $(am__objects_1) $(am__objects_2) +am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT) $(am__objects_3) libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS) PROGRAMS = $(bin_PROGRAMS) -am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \ - Grid_communicator_fake.cc -@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT) -@BUILD_COMMS_NONE_TRUE@am__objects_2 = \ -@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT) -am__objects_3 = $(am__objects_1) $(am__objects_2) -am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3) +am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) Grid_main_OBJECTS = $(am_Grid_main_OBJECTS) Grid_main_DEPENDENCIES = libGrid.a +am_test_Grid_stencil_OBJECTS = test_Grid_Stencil.$(OBJEXT) +test_Grid_stencil_OBJECTS = $(am_test_Grid_stencil_OBJECTS) +test_Grid_stencil_DEPENDENCIES = libGrid.a AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) am__v_P_0 = false @@ -183,8 +190,10 @@ AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) am__v_CXXLD_0 = @echo " CXXLD " $@; am__v_CXXLD_1 = -SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES) -DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST) +SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES) \ + $(test_Grid_stencil_SOURCES) +DIST_SOURCES = $(am__libGrid_a_SOURCES_DIST) $(Grid_main_SOURCES) \ + $(test_Grid_stencil_SOURCES) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ n|no|NO) false;; \ @@ -329,12 +338,13 @@ top_srcdir = @top_srcdir@ # additional include paths necessary to compile the C++ library AM_CXXFLAGS = -I$(top_srcdir)/ +extra_sources = $(am__append_1) $(am__append_2) # # Libraries # lib_LIBRARIES = libGrid.a -libGrid_a_SOURCES = Grid_init.cc +libGrid_a_SOURCES = Grid_init.cc $(extra_sources) # # Include files @@ -355,14 +365,13 @@ include_HEADERS = Grid_config.h\ Grid_cshift_common.h\ Grid_cshift_mpi.h\ Grid_cshift_none.h\ + Grid_stencil.h\ Grid_math_types.h -extra_sources = $(am__append_1) $(am__append_2) -Grid_main_SOURCES = \ - Grid_main.cc\ - $(extra_sources) - +Grid_main_SOURCES = Grid_main.cc Grid_main_LDADD = libGrid.a +test_Grid_stencil_SOURCES = test_Grid_Stencil.cc +test_Grid_stencil_LDADD = libGrid.a all: Grid_config.h $(MAKE) $(AM_MAKEFLAGS) all-am @@ -499,6 +508,10 @@ Grid_main$(EXEEXT): $(Grid_main_OBJECTS) $(Grid_main_DEPENDENCIES) $(EXTRA_Grid_ @rm -f Grid_main$(EXEEXT) $(AM_V_CXXLD)$(CXXLINK) $(Grid_main_OBJECTS) $(Grid_main_LDADD) $(LIBS) +test_Grid_stencil$(EXEEXT): $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_DEPENDENCIES) $(EXTRA_test_Grid_stencil_DEPENDENCIES) + @rm -f test_Grid_stencil$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_LDADD) $(LIBS) + mostlyclean-compile: -rm -f *.$(OBJEXT) @@ -509,6 +522,8 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_stencil_common.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_Grid_Stencil.Po@am__quote@ .cc.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< diff --git a/TODO b/TODO index 5297e213..665e3a55 100644 --- a/TODO +++ b/TODO @@ -3,7 +3,7 @@ * Replace vset with a call to merge.; * care in Gmerge,Gextract over set. * Const audit -* extract / merge extra implementation removal +* extract / merge extra implementation removal * Conditional execution, where etc... -----DONE, simple test diff --git a/test_Grid_jacobi.cc b/test_Grid_jacobi.cc new file mode 100644 index 00000000..95e808f1 --- /dev/null +++ b/test_Grid_jacobi.cc @@ -0,0 +1,206 @@ +#include "Grid.h" + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +class LinearOperator { +public: + operator () (const Lattice&src,Lattice &result) {}; +}; + +template +class LinearOperatorJacobi : public LinearOperator +{ + CartesianStencil *Stencil; + GridBase *_grid; + std::vector > comm_buf; + LinearOperatorJacobi(GridCartesian *grid) + { + _grid = grid; + int npoint=9; + std::vector directions(npoint); + std::vector displacements(npoint); + for(int mu=0;mu<4;mu++){ + for(int mp=0;mp<2;mp++){ + int dir = 2*mu+mp; + directions[dir] = mu; + displacements[dir]= -1+2*mp; + } + } + directions[8] = 0; + displacements[8] = 0; + + Stencil = new CartesianStencil(grid,npoint,0,directions,displacements); + + comm_buf.resize(Stencil->_unified_buffer_size); + + } + operator () (const Lattice&src,Lattice &result) + { + const int npoint=9; + printf("calling halo exchange\n");fflush(stdout); + myStencil.HaloExchange(Foo,comm_buf); + vobj tmp; + vobj + for(int i=0;i<_grid->oSites();i++){ + for(int p=0;p_offsets [p][i]; + int local = Stencil->_is_local[p][i]; + int ptype = Stencil->_permute_type[p]; + int perm = Stencil->_permute[0][i]; + + vobj *nbr; + if ( local && perm ){ + permute(tmp,src._odata[offset],ptype); + nbr = &tmp; + } else if (local) { + nbr = &src._odata[offset]; + } else { + nbr = &comm_buf[offset]; + } + result[i] = result[i]+*nbr; + } + } + } + ~LinearOperatorJacobi() + { + delete Stencil; + } +} + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size (4); + std::vector simd_layout(4); + std::vector mpi_layout (4); + + int omp=1; + int lat=8; + + mpi_layout[0]=1; + mpi_layout[1]=2; + mpi_layout[2]=1; + mpi_layout[3]=1; + + latt_size[0] = lat; + latt_size[1] = lat; + latt_size[2] = lat; + latt_size[3] = lat; + double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + +#ifdef AVX512 + simd_layout[0] = 1; + simd_layout[1] = 2; + simd_layout[2] = 2; + simd_layout[3] = 2; +#endif +#if defined (AVX1)|| defined (AVX2) + simd_layout[0] = 1; + simd_layout[1] = 1; + simd_layout[2] = 2; + simd_layout[3] = 2; +#endif +#if defined (SSE2) + simd_layout[0] = 1; + simd_layout[1] = 1; + simd_layout[2] = 1; + simd_layout[3] = 2; +#endif + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout); + + LatticeColourMatrix Foo(&Fine); + LatticeColourMatrix Bar(&Fine); + LatticeColourMatrix Check(&Fine); + LatticeColourMatrix Diff(&Fine); + + random(Foo); + gaussian(Bar); + + + for(int dir=0;dir<4;dir++){ + for(int disp=0;disp directions(npoint,dir); + std::vector displacements(npoint,disp); + + CartesianStencil myStencil(&Fine,npoint,0,directions,displacements); + + printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp); + std::vector ocoor(4); + for(int o=0;o > comm_buf(myStencil._unified_buffer_size); + printf("calling halo exchange\n");fflush(stdout); + myStencil.HaloExchange(Foo,comm_buf); + + Bar = Cshift(Foo,dir,disp); + + // Implement a stencil code that should agree with cshift! + for(int i=0;ioSites();i++){ + + int offset = myStencil._offsets [0][i]; + int local = myStencil._is_local[0][i]; + int permute_type = myStencil._permute_type[0]; + int perm =myStencil._permute[0][i]; + if ( local && perm ) + permute(Check._odata[i],Foo._odata[offset],permute_type); + else if (local) + Check._odata[i] = Foo._odata[offset]; + else + Check._odata[i] = comm_buf[offset]; + + + } + + + std::vector coor(4); + for(coor[3]=0;coor[3] 0 ){ + printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n", + coor[0],coor[1],coor[2],coor[3],r,c, + nn, + real(check._internal._internal[r][c]), + real(bar._internal._internal[r][c]) + ); + } + }} + + }}}} + + } + } + + Grid_finalize(); +} diff --git a/test_Grid_stencil.cc b/test_Grid_stencil.cc new file mode 100644 index 00000000..8077562f --- /dev/null +++ b/test_Grid_stencil.cc @@ -0,0 +1,154 @@ +#include "Grid.h" + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size (4); + std::vector simd_layout(4); + std::vector mpi_layout (4); + + int omp=1; + int lat=8; + + mpi_layout[0]=1; + mpi_layout[1]=1; + mpi_layout[2]=1; + mpi_layout[3]=1; + + latt_size[0] = lat; + latt_size[1] = lat; + latt_size[2] = lat; + latt_size[3] = lat; + double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + +#ifdef AVX512 + simd_layout[0] = 1; + simd_layout[1] = 1; + simd_layout[2] = 2; + simd_layout[3] = 2; +#endif +#if defined (AVX1)|| defined (AVX2) + simd_layout[0] = 1; + simd_layout[1] = 1; + simd_layout[2] = 1; + simd_layout[3] = 2; +#endif +#if defined (SSE2) + simd_layout[0] = 1; + simd_layout[1] = 1; + simd_layout[2] = 1; + simd_layout[3] = 2; +#endif + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout); + + LatticeColourMatrix Foo(&Fine); + LatticeColourMatrix Bar(&Fine); + LatticeColourMatrix Check(&Fine); + LatticeColourMatrix Diff(&Fine); + + random(Foo); + gaussian(Bar); + + + for(int dir=0;dir<4;dir++){ + for(int disp=0;disp directions(npoint,dir); + std::vector displacements(npoint,disp); + + CartesianStencil myStencil(&Fine,npoint,0,directions,displacements); + + printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp); + std::vector ocoor(4); + for(int o=0;o > comm_buf(myStencil._unified_buffer_size); + printf("calling halo exchange\n");fflush(stdout); + myStencil.HaloExchange(Foo,comm_buf); + + Bar = Cshift(Foo,dir,disp); + + // Implement a stencil code that should agree with cshift! + for(int i=0;ioSites();i++){ + + int offset = myStencil._offsets [0][i]; + int local = myStencil._is_local[0][i]; + int permute_type = myStencil._permute_type[0]; + int perm =myStencil._permute[0][i]; + if ( local && perm ) + permute(Check._odata[i],Foo._odata[offset],permute_type); + else if (local) + Check._odata[i] = Foo._odata[offset]; + else + Check._odata[i] = comm_buf[offset]; + + + } + + Real nrmC = norm2(Check); + Real nrmB = norm2(Bar); + Real nrm = norm2(Check-Bar); + printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout); + + Real snrmC =0; + Real snrmB =0; + Real snrm =0; + + std::vector coor(4); + for(coor[3]=0;coor[3] 0){ + printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n", + coor[0],coor[1],coor[2],coor[3],r,c, + nn, + real(check._internal._internal[r][c]), + real(bar._internal._internal[r][c]) + ); + } + snrmC=snrmC+real(conj(check._internal._internal[r][c])*check._internal._internal[r][c]); + snrmB=snrmB+real(conj(bar._internal._internal[r][c])*bar._internal._internal[r][c]); + snrm=snrm+nn; + }} + + }}}} + + printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout); + + + } + } + + Grid_finalize(); +}