1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Stencil code pretty much shaken out.

Beginning of inner product and norm2.
This commit is contained in:
Peter Boyle 2015-04-14 20:22:04 +01:00
parent 977c7721d5
commit 2d54ef2a52
16 changed files with 1050 additions and 363 deletions

1
Grid.h
View File

@ -47,6 +47,7 @@
#include <Grid_Cartesian.h>
#include <Grid_Lattice.h>
#include <Grid_comparison.h>
#include <Grid_stencil.h>
#include <Grid_QCD.h>
namespace Grid {

View File

@ -407,6 +407,18 @@ public:
return ret;
}
template<class vobj>
inline RealD norm2(const Lattice<vobj> &arg){
typedef typename vobj::scalar_type scalar;
decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
scalar nrm;
for(int ss=0;ss<arg._grid->oSites(); ss++){
vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
}
nrm = Reduce(TensorRemove(vnrm));
return real(nrm);
}
}
#endif

View File

@ -11,10 +11,10 @@
/* #undef AVX512 */
/* GRID_COMMS_MPI */
/* #undef GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1
/* GRID_COMMS_NONE */
#define GRID_COMMS_NONE 1
/* #undef GRID_COMMS_NONE */
/* Define to 1 if you have the `gettimeofday' function. */
#define HAVE_GETTIMEOFDAY 1

View File

@ -8,6 +8,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
{
int rd = rhs._grid->_rdimensions[dimension];
printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
@ -18,6 +19,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
buffer[bo++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];

View File

@ -1,10 +1,4 @@
#include "Grid.h"
#include "Grid_vRealD.h"
#include "Grid_vRealF.h"
#include "Grid_vComplexD.h"
#include "Grid_vComplexF.h"
#include "Grid_Cartesian.h"
#include "Grid_Lattice.h"
using namespace std;
using namespace Grid;
@ -16,13 +10,14 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout(4);
mpi_layout[0]=1;
mpi_layout[1]=1;
mpi_layout[2]=1;
mpi_layout[3]=1;
mpi_layout[0]=2;
mpi_layout[1]=2;
mpi_layout[2]=2;
mpi_layout[3]=2;
#ifdef AVX512
for(int omp=128;omp<236;omp+=16){

View File

@ -922,9 +922,10 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
{
typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret=zero;
iScalar<ret_t> tmp;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}

View File

@ -26,11 +26,10 @@ namespace Grid {
typedef float RealF;
typedef double RealD;
typedef RealF Real;
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
typedef std::complex<Real> Complex;
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
@ -54,7 +53,7 @@ namespace Grid {
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline Complex adj(const Complex& r ){ return(conj(r)); }
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
//conj already supported for complex
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
@ -232,4 +231,52 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
#include <Grid_vComplexF.h>
#include <Grid_vComplexD.h>
namespace Grid {
// NB: Template the following on "type Complex" and then implement *,+,- for
// ComplexF, ComplexD, RealF, RealD above to
// get full generality of binops with scalars.
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
// Default precision
typedef RealD Real;
typedef std::complex<Real> Complex;
typedef vRealD vReal;
typedef vComplexD vComplex;
}
#endif

View File

@ -1,11 +1,15 @@
#ifndef GRID_STENCIL_H
#define GRID_STENCIL_H
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so probably need
// gather to a point stencil code. CSHIFT is not the best way, so need
// additional stencil support.
//
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
// Stencil based code will pre-exchange haloes and use a table lookup for neighbours.
// This will be done with generality to allow easier efficient implementations.
// Overlap of comms and compute could be semi-automated by tabulating off-node connected,
// and
//
// Lattice <foo> could also allocate haloes which get used for stencil code.
//
@ -35,329 +39,313 @@
namespace Grid {
class Stencil {
struct CommsRequest {
int words;
int unified_buffer_offset;
int tag;
int to_rank;
int from_rank;
} ;
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
Stencil(GridBase *grid,
int npoints,
int checkerboard,
std::vector<int> directions,
std::vector<int> distances);
void Stencil_local (int dimension,int shift,int cbmask);
void Stencil_comms (int dimension,int shift,int cbmask);
void Stencil_comms_simd(int dimension,int shift,int cbmask);
// Will need to implement actions for
Copy_plane;
Copy_plane_permute;
Gather_plane;
// The offsets to all neibours in stencil in each direction
int _checkerboard;
int _npoints; // Move to template param?
GridBase * _grid;
// Store these as SIMD Integer needed
//
// std::vector< iVector<Integer, Npoint> > _offsets;
// std::vector< iVector<Integer, Npoint> > _local;
// std::vector< iVector<Integer, Npoint> > _comm_buf_size;
// std::vector< iVector<Integer, Npoint> > _permute;
std::vector<std::vector<int> > _offsets;
std::vector<std::vector<int> > _local;
// npoints of these
std::vector<int> _directions;
std::vector<int> _distances;
std::vector<int> _comm_buf_size;
std::vector<int> _permute;
std::vector<int> _permute_type;
};
// npoints x Osites() of these
std::vector<std::vector<int> > _offsets;
std::vector<std::vector<int> > _is_local;
std::vector<std::vector<int> > _permute;
Stencil::Stencil(GridBase *grid,
int npoints,
int checkerboard,
std::vector<int> directions,
std::vector<int> distances){
_npoints = npoints;
_grid = grid;
for(int i=0;i<npoints;i++){
int _unified_buffer_size;
int _request_count;
int dimension = directions[i];
int displacement = distances[i];
std::vector<CommsRequest> CommsRequests;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
// Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
int sshift[2];
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
template<class vobj> void HaloExchange(Lattice<vobj> &source,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0;
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
// Gather all comms buffers
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
if ( sshift[0] == sshift[1] ) {
Stencil_local(dimension,shift,0x3);
} else {
Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Stencil_local(dimension,shift,0x2);// both with block stride loop iteration
}
} else if ( splice_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
for(int point = 0 ; point < _npoints; point++) {
printf("Point %d \n",point);fflush(stdout);
int dimension = _directions[point];
int displacement = _distances[point];
if ( sshift[0] == sshift[1] ) {
Stencil_comms_simd(dimension,shift,0x3);
} else {
Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration
}
} else {
// Cshift_comms(ret,rhs,dimension,shift);
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Stencil_comms(dimension,shift,0x3);
} else {
Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (displacement+fd)%fd;
int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
assert (checkerboard== _checkerboard);
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
// Gather phase
int sshift [2];
if ( comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
printf("splice 0x3 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
} else {
printf("NO splice 0x3 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
}
} else {
if(splice_dim){
printf("splice 0x1,2 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
} else {
printf("NO splice 0x1,2 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
}
}
}
}
}
}
void Stencil::Stencil_local (int dimension,int shift,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
int permute_type=_grid->PermuteType(dimension);
for(int x=0;x<rd;x++){
template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int o = 0;
int bo = x * _grid->_ostride[dimension];
GridBase *grid=_grid;
assert(rhs._grid==_grid);
// conformable(_grid,rhs._grid);
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type);
else Copy_plane (dimension,x,sx,cbmask);
}
}
void Stencil::Stencil_comms (int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
// FIXME: Do something with buffer_size??
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
for(int x=0;x<rd;x++){
Copy_plane(dimension,x,sx,cbmask);
printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
/*
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
*/
Scatter_plane_simple (dimension,x,cbmask);
}
}
}
void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask)
{
GridBase *grid=_grid;
const int Nsimd = _grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=_grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// FIXME do something with buffer size
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
if (offnode) {
int s;
grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
/*
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
*/
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
// FIXME Implement asynchronous send & also avoid buffer copy
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
for(int i=0;i<buffer_size;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
}
u_comm_offset+=buffer_size;
printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
Scatter_plane_merge(pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
};
template<class vobj>
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
{
const int Nsimd = _grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=_grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
/* FIXME ALTERNATE BUFFER DETERMINATION */
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(_grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*_grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
_grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
_grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
// Here we don't want to scatter, just place into a buffer.
for(int i=0;i<buffer_size;i++){
merge(u_comm_buf[u_comm_offset+i],pointers);
}
}
}
}
};
}
#endif

258
Grid_stencil_common.cc Normal file
View File

@ -0,0 +1,258 @@
#include "Grid.h"
namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _offsets(npoints),
_is_local(npoints),
_comm_buf_size(npoints),
_permute_type(npoints),
_permute(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
CommsRequests.resize(0);
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_offsets[i].resize( osites);
_is_local[i].resize(osites);
_permute[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
}
}
}
}
void CartesianStencil::Local (int point, int dimension,int shift,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
}
}
void CartesianStencil::Comms (int point,int dimension,int shift,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
CommsRequest cr;
cr.tag = _request_count++;
cr.words = words;
cr.unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
CommsRequests.push_back(cr);
ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_offsets [point][lo+o+b]=ro+o+b;
_is_local[point][lo+o+b]=1;
_permute [point][lo+o+b]=permute;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_offsets [point][lo+o+b]=ro+o+b;
_is_local[point][lo+o+b]=1;
_permute [point][lo+o+b]=permute;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_offsets [point][so+o+b]=offset+(bo++);
_is_local[point][so+o+b]=0;
_permute [point][so+o+b]=0;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_offsets [point][so+o+b]=offset+(bo++);
_is_local[point][so+o+b]=0;
_permute [point][so+o+b]=0;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
}

View File

@ -194,6 +194,8 @@ namespace Grid {
float b= imag(c);
vsplat(ret,a,b);
}
friend inline void vsplat(vComplexD &ret,double rl,double ig){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(ig,rl,ig,rl);
@ -321,13 +323,14 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
{
return l*r;
}
inline vComplex trace(const vComplex &arg){
inline vComplexD trace(const vComplexD &arg){
return arg;
}
/////////////////////////////////////////////////////////////////////////
//// Generic routine to promote object<complex> -> object<vcomplex>
//// Supports the array reordering transformation that gives me SIMD utilisation
///////////////////////////////////////////////////////////////////////////
/*
template<template<class> class object>
inline object<vComplex> splat(object<Complex >s){
object<vComplex> ret;
@ -338,6 +341,6 @@ inline object<vComplex> splat(object<Complex >s){
}
return ret;
}
*/
}
#endif

View File

@ -3,6 +3,16 @@
#include "Grid.h"
namespace Grid {
/*
inline void Print(const char *A,cvec c) {
float *fp=(float *)&c;
printf(A);
printf(" %le %le %le %le %le %le %le %le\n",
fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
}
*/
class vComplexF {
// protected:
@ -108,7 +118,7 @@ namespace Grid {
ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_ps(ymm0,ymm1); // FIXME -- AVX2 could MAC
ret.v= _mm256_addsub_ps(ymm0,ymm1);
#endif
#ifdef SSE2
cvec ymm0,ymm1,ymm2;
@ -142,7 +152,7 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vset(vComplexF &ret, Complex *a){
friend inline void vset(vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
@ -216,12 +226,12 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
#endif
}
friend inline vComplexF operator * (const Complex &a, vComplexF b){
friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va*b;
}
friend inline vComplexF operator * (vComplexF b,const Complex &a){
friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
return a*b;
}
@ -283,22 +293,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
}
*/
// NB: Template the following on "type Complex" and then implement *,+,- for
// ComplexF, ComplexD, RealF, RealD above to
// get full generality of binops with scalars.
friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
friend inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const Complex *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) + (*r); }
///////////////////////
// Conjugate
///////////////////////
friend inline vComplexF conj(const vComplexF &in){
vComplexF ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
@ -363,10 +362,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
};
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)
{
return conj(l)*r;
}
typedef vComplexF vFComplex;
typedef vComplexF vComplex;
inline void zeroit(vComplexF &z){ vzero(z);}
inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)

View File

@ -1,11 +1,22 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_communicator_mpi.cc
extra_sources+=Grid_stencil_common.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_communicator_fake.cc
extra_sources+=Grid_stencil_common.cc
endif
#
# Libraries
#
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = Grid_init.cc
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
#
# Include files
@ -26,24 +37,18 @@ include_HEADERS = Grid_config.h\
Grid_cshift_common.h\
Grid_cshift_mpi.h\
Grid_cshift_none.h\
Grid_stencil.h\
Grid_math_types.h
#
# Test code
#
bin_PROGRAMS = Grid_main
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_communicator_mpi.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_communicator_fake.cc
endif
Grid_main_SOURCES = \
Grid_main.cc\
$(extra_sources)
bin_PROGRAMS = Grid_main test_Grid_stencil
Grid_main_SOURCES = Grid_main.cc
Grid_main_LDADD = libGrid.a
test_Grid_stencil_SOURCES = test_Grid_Stencil.cc
test_Grid_stencil_LDADD = libGrid.a

View File

@ -88,9 +88,11 @@ POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
bin_PROGRAMS = Grid_main$(EXEEXT)
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc \
@BUILD_COMMS_MPI_TRUE@ Grid_stencil_common.cc
@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc \
@BUILD_COMMS_NONE_TRUE@ Grid_stencil_common.cc
bin_PROGRAMS = Grid_main$(EXEEXT) test_Grid_stencil$(EXEEXT)
subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@ -142,18 +144,23 @@ am__v_AR_0 = @echo " AR " $@;
am__v_AR_1 =
libGrid_a_AR = $(AR) $(ARFLAGS)
libGrid_a_LIBADD =
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
am__libGrid_a_SOURCES_DIST = Grid_init.cc Grid_communicator_mpi.cc \
Grid_stencil_common.cc Grid_communicator_fake.cc
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT) \
@BUILD_COMMS_MPI_TRUE@ Grid_stencil_common.$(OBJEXT)
@BUILD_COMMS_NONE_TRUE@am__objects_2 = \
@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT) \
@BUILD_COMMS_NONE_TRUE@ Grid_stencil_common.$(OBJEXT)
am__objects_3 = $(am__objects_1) $(am__objects_2)
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT) $(am__objects_3)
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
PROGRAMS = $(bin_PROGRAMS)
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
Grid_communicator_fake.cc
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
@BUILD_COMMS_NONE_TRUE@am__objects_2 = \
@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT)
am__objects_3 = $(am__objects_1) $(am__objects_2)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT)
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
Grid_main_DEPENDENCIES = libGrid.a
am_test_Grid_stencil_OBJECTS = test_Grid_Stencil.$(OBJEXT)
test_Grid_stencil_OBJECTS = $(am_test_Grid_stencil_OBJECTS)
test_Grid_stencil_DEPENDENCIES = libGrid.a
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
@ -183,8 +190,10 @@ AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
am__v_CXXLD_0 = @echo " CXXLD " $@;
am__v_CXXLD_1 =
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST)
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES) \
$(test_Grid_stencil_SOURCES)
DIST_SOURCES = $(am__libGrid_a_SOURCES_DIST) $(Grid_main_SOURCES) \
$(test_Grid_stencil_SOURCES)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
@ -329,12 +338,13 @@ top_srcdir = @top_srcdir@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
extra_sources = $(am__append_1) $(am__append_2)
#
# Libraries
#
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = Grid_init.cc
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
#
# Include files
@ -355,14 +365,13 @@ include_HEADERS = Grid_config.h\
Grid_cshift_common.h\
Grid_cshift_mpi.h\
Grid_cshift_none.h\
Grid_stencil.h\
Grid_math_types.h
extra_sources = $(am__append_1) $(am__append_2)
Grid_main_SOURCES = \
Grid_main.cc\
$(extra_sources)
Grid_main_SOURCES = Grid_main.cc
Grid_main_LDADD = libGrid.a
test_Grid_stencil_SOURCES = test_Grid_Stencil.cc
test_Grid_stencil_LDADD = libGrid.a
all: Grid_config.h
$(MAKE) $(AM_MAKEFLAGS) all-am
@ -499,6 +508,10 @@ Grid_main$(EXEEXT): $(Grid_main_OBJECTS) $(Grid_main_DEPENDENCIES) $(EXTRA_Grid_
@rm -f Grid_main$(EXEEXT)
$(AM_V_CXXLD)$(CXXLINK) $(Grid_main_OBJECTS) $(Grid_main_LDADD) $(LIBS)
test_Grid_stencil$(EXEEXT): $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_DEPENDENCIES) $(EXTRA_test_Grid_stencil_DEPENDENCIES)
@rm -f test_Grid_stencil$(EXEEXT)
$(AM_V_CXXLD)$(CXXLINK) $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_LDADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
@ -509,6 +522,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_stencil_common.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_Grid_Stencil.Po@am__quote@
.cc.o:
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<

2
TODO
View File

@ -3,7 +3,7 @@
* Replace vset with a call to merge.;
* care in Gmerge,Gextract over set.
* Const audit
* extract / merge extra implementation removal
* extract / merge extra implementation removal
* Conditional execution, where etc... -----DONE, simple test

206
test_Grid_jacobi.cc Normal file
View File

@ -0,0 +1,206 @@
#include "Grid.h"
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class vobj>
class LinearOperator {
public:
operator () (const Lattice<vobj>&src,Lattice<vobj> &result) {};
};
template<class vobj>
class LinearOperatorJacobi : public LinearOperator<vobj>
{
CartesianStencil *Stencil;
GridBase *_grid;
std::vector<vobj,alignedAllocator<vobj> > comm_buf;
LinearOperatorJacobi(GridCartesian *grid)
{
_grid = grid;
int npoint=9;
std::vector<int> directions(npoint);
std::vector<int> displacements(npoint);
for(int mu=0;mu<4;mu++){
for(int mp=0;mp<2;mp++){
int dir = 2*mu+mp;
directions[dir] = mu;
displacements[dir]= -1+2*mp;
}
}
directions[8] = 0;
displacements[8] = 0;
Stencil = new CartesianStencil(grid,npoint,0,directions,displacements);
comm_buf.resize(Stencil->_unified_buffer_size);
}
operator () (const Lattice<vobj>&src,Lattice<vobj> &result)
{
const int npoint=9;
printf("calling halo exchange\n");fflush(stdout);
myStencil.HaloExchange(Foo,comm_buf);
vobj tmp;
vobj
for(int i=0;i<_grid->oSites();i++){
for(int p=0;p<npoint;p++){
int offset = Stencil->_offsets [p][i];
int local = Stencil->_is_local[p][i];
int ptype = Stencil->_permute_type[p];
int perm = Stencil->_permute[0][i];
vobj *nbr;
if ( local && perm ){
permute(tmp,src._odata[offset],ptype);
nbr = &tmp;
} else if (local) {
nbr = &src._odata[offset];
} else {
nbr = &comm_buf[offset];
}
result[i] = result[i]+*nbr;
}
}
}
~LinearOperatorJacobi()
{
delete Stencil;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size (4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout (4);
int omp=1;
int lat=8;
mpi_layout[0]=1;
mpi_layout[1]=2;
mpi_layout[2]=1;
mpi_layout[3]=1;
latt_size[0] = lat;
latt_size[1] = lat;
latt_size[2] = lat;
latt_size[3] = lat;
double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
#ifdef AVX512
simd_layout[0] = 1;
simd_layout[1] = 2;
simd_layout[2] = 2;
simd_layout[3] = 2;
#endif
#if defined (AVX1)|| defined (AVX2)
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[2] = 2;
simd_layout[3] = 2;
#endif
#if defined (SSE2)
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[2] = 1;
simd_layout[3] = 2;
#endif
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
LatticeColourMatrix Foo(&Fine);
LatticeColourMatrix Bar(&Fine);
LatticeColourMatrix Check(&Fine);
LatticeColourMatrix Diff(&Fine);
random(Foo);
gaussian(Bar);
for(int dir=0;dir<4;dir++){
for(int disp=0;disp<Fine._rdimensions[dir];disp++){
// start to test the Cartesian npoint stencil infrastructure
int npoint=;
std::vector<int> directions(npoint,dir);
std::vector<int> displacements(npoint,disp);
CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
std::vector<int> ocoor(4);
for(int o=0;o<Fine.oSites();o++){
Fine.oCoorFromOindex(ocoor,o);
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
int nbr = Fine.oIndexReduced(ocoor);
int stcl= myStencil._offsets[0][o];
if(nbr!=stcl){
printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
}
}
printf("allocating %d buffers\n",myStencil._unified_buffer_size);
fflush(stdout);
std::vector<vColourMatrix,alignedAllocator<vColourMatrix> > comm_buf(myStencil._unified_buffer_size);
printf("calling halo exchange\n");fflush(stdout);
myStencil.HaloExchange(Foo,comm_buf);
Bar = Cshift(Foo,dir,disp);
// Implement a stencil code that should agree with cshift!
for(int i=0;i<Check._grid->oSites();i++){
int offset = myStencil._offsets [0][i];
int local = myStencil._is_local[0][i];
int permute_type = myStencil._permute_type[0];
int perm =myStencil._permute[0][i];
if ( local && perm )
permute(Check._odata[i],Foo._odata[offset],permute_type);
else if (local)
Check._odata[i] = Foo._odata[offset];
else
Check._odata[i] = comm_buf[offset];
}
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
Complex diff;
ColourMatrix check,bar;
peekSite(check,Check,coor);
peekSite(bar,Bar,coor);
for(int r=0;r<3;r++){
for(int c=0;c<3;c++){
diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
double nn=real(conj(diff)*diff);
if ( nn > 0 ){
printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
coor[0],coor[1],coor[2],coor[3],r,c,
nn,
real(check._internal._internal[r][c]),
real(bar._internal._internal[r][c])
);
}
}}
}}}}
}
}
Grid_finalize();
}

154
test_Grid_stencil.cc Normal file
View File

@ -0,0 +1,154 @@
#include "Grid.h"
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size (4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout (4);
int omp=1;
int lat=8;
mpi_layout[0]=1;
mpi_layout[1]=1;
mpi_layout[2]=1;
mpi_layout[3]=1;
latt_size[0] = lat;
latt_size[1] = lat;
latt_size[2] = lat;
latt_size[3] = lat;
double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
#ifdef AVX512
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[2] = 2;
simd_layout[3] = 2;
#endif
#if defined (AVX1)|| defined (AVX2)
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[2] = 1;
simd_layout[3] = 2;
#endif
#if defined (SSE2)
simd_layout[0] = 1;
simd_layout[1] = 1;
simd_layout[2] = 1;
simd_layout[3] = 2;
#endif
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
LatticeColourMatrix Foo(&Fine);
LatticeColourMatrix Bar(&Fine);
LatticeColourMatrix Check(&Fine);
LatticeColourMatrix Diff(&Fine);
random(Foo);
gaussian(Bar);
for(int dir=0;dir<4;dir++){
for(int disp=0;disp<Fine._rdimensions[dir];disp++){
// start to test the Cartesian npoint stencil infrastructure
int npoint=1;
std::vector<int> directions(npoint,dir);
std::vector<int> displacements(npoint,disp);
CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
std::vector<int> ocoor(4);
for(int o=0;o<Fine.oSites();o++){
Fine.oCoorFromOindex(ocoor,o);
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
int nbr = Fine.oIndexReduced(ocoor);
int stcl= myStencil._offsets[0][o];
if(nbr!=stcl){
printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
}
}
printf("allocating %d buffers\n",myStencil._unified_buffer_size);
fflush(stdout);
std::vector<vColourMatrix,alignedAllocator<vColourMatrix> > comm_buf(myStencil._unified_buffer_size);
printf("calling halo exchange\n");fflush(stdout);
myStencil.HaloExchange(Foo,comm_buf);
Bar = Cshift(Foo,dir,disp);
// Implement a stencil code that should agree with cshift!
for(int i=0;i<Check._grid->oSites();i++){
int offset = myStencil._offsets [0][i];
int local = myStencil._is_local[0][i];
int permute_type = myStencil._permute_type[0];
int perm =myStencil._permute[0][i];
if ( local && perm )
permute(Check._odata[i],Foo._odata[offset],permute_type);
else if (local)
Check._odata[i] = Foo._odata[offset];
else
Check._odata[i] = comm_buf[offset];
}
Real nrmC = norm2(Check);
Real nrmB = norm2(Bar);
Real nrm = norm2(Check-Bar);
printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout);
Real snrmC =0;
Real snrmB =0;
Real snrm =0;
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
Complex diff;
ColourMatrix check,bar;
peekSite(check,Check,coor);
peekSite(bar,Bar,coor);
for(int r=0;r<3;r++){
for(int c=0;c<3;c++){
diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
double nn=real(conj(diff)*diff);
if ( nn > 0){
printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
coor[0],coor[1],coor[2],coor[3],r,c,
nn,
real(check._internal._internal[r][c]),
real(bar._internal._internal[r][c])
);
}
snrmC=snrmC+real(conj(check._internal._internal[r][c])*check._internal._internal[r][c]);
snrmB=snrmB+real(conj(bar._internal._internal[r][c])*bar._internal._internal[r][c]);
snrm=snrm+nn;
}}
}}}}
printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout);
}
}
Grid_finalize();
}