mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Stencil code pretty much shaken out.
Beginning of inner product and norm2.
This commit is contained in:
parent
977c7721d5
commit
2d54ef2a52
1
Grid.h
1
Grid.h
@ -47,6 +47,7 @@
|
||||
#include <Grid_Cartesian.h>
|
||||
#include <Grid_Lattice.h>
|
||||
#include <Grid_comparison.h>
|
||||
#include <Grid_stencil.h>
|
||||
#include <Grid_QCD.h>
|
||||
|
||||
namespace Grid {
|
||||
|
@ -407,6 +407,18 @@ public:
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline RealD norm2(const Lattice<vobj> &arg){
|
||||
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
|
||||
|
||||
scalar nrm;
|
||||
for(int ss=0;ss<arg._grid->oSites(); ss++){
|
||||
vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
|
||||
}
|
||||
nrm = Reduce(TensorRemove(vnrm));
|
||||
return real(nrm);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -11,10 +11,10 @@
|
||||
/* #undef AVX512 */
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
/* #undef GRID_COMMS_MPI */
|
||||
#define GRID_COMMS_MPI 1
|
||||
|
||||
/* GRID_COMMS_NONE */
|
||||
#define GRID_COMMS_NONE 1
|
||||
/* #undef GRID_COMMS_NONE */
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#define HAVE_GETTIMEOFDAY 1
|
||||
|
@ -8,6 +8,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
@ -18,6 +19,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
||||
#pragma omp parallel for collapse(2)
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
|
||||
buffer[bo++]=rhs._odata[so+o+b];
|
||||
}
|
||||
o +=rhs._grid->_slice_stride[dimension];
|
||||
|
15
Grid_main.cc
15
Grid_main.cc
@ -1,10 +1,4 @@
|
||||
#include "Grid.h"
|
||||
#include "Grid_vRealD.h"
|
||||
#include "Grid_vRealF.h"
|
||||
#include "Grid_vComplexD.h"
|
||||
#include "Grid_vComplexF.h"
|
||||
#include "Grid_Cartesian.h"
|
||||
#include "Grid_Lattice.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
@ -16,13 +10,14 @@ int main (int argc, char ** argv)
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::vector<int> latt_size(4);
|
||||
|
||||
std::vector<int> simd_layout(4);
|
||||
|
||||
std::vector<int> mpi_layout(4);
|
||||
mpi_layout[0]=1;
|
||||
mpi_layout[1]=1;
|
||||
mpi_layout[2]=1;
|
||||
mpi_layout[3]=1;
|
||||
mpi_layout[0]=2;
|
||||
mpi_layout[1]=2;
|
||||
mpi_layout[2]=2;
|
||||
mpi_layout[3]=2;
|
||||
|
||||
#ifdef AVX512
|
||||
for(int omp=128;omp<236;omp+=16){
|
||||
|
@ -922,9 +922,10 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
|
||||
{
|
||||
typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
|
||||
iScalar<ret_t> ret=zero;
|
||||
iScalar<ret_t> tmp;
|
||||
for(int c1=0;c1<N;c1++){
|
||||
for(int c2=0;c2<N;c2++){
|
||||
ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
|
||||
ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
|
||||
}}
|
||||
return ret;
|
||||
}
|
||||
|
53
Grid_simd.h
53
Grid_simd.h
@ -26,11 +26,10 @@ namespace Grid {
|
||||
|
||||
typedef float RealF;
|
||||
typedef double RealD;
|
||||
typedef RealF Real;
|
||||
|
||||
typedef std::complex<RealF> ComplexF;
|
||||
typedef std::complex<RealD> ComplexD;
|
||||
typedef std::complex<Real> Complex;
|
||||
|
||||
|
||||
inline RealF adj(const RealF & r){ return r; }
|
||||
inline RealF conj(const RealF & r){ return r; }
|
||||
@ -54,7 +53,7 @@ namespace Grid {
|
||||
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline Complex adj(const Complex& r ){ return(conj(r)); }
|
||||
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
|
||||
//conj already supported for complex
|
||||
|
||||
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
|
||||
@ -232,4 +231,52 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
|
||||
#include <Grid_vComplexF.h>
|
||||
#include <Grid_vComplexD.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// NB: Template the following on "type Complex" and then implement *,+,- for
|
||||
// ComplexF, ComplexD, RealF, RealD above to
|
||||
// get full generality of binops with scalars.
|
||||
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
// Default precision
|
||||
typedef RealD Real;
|
||||
typedef std::complex<Real> Complex;
|
||||
|
||||
typedef vRealD vReal;
|
||||
typedef vComplexD vComplex;
|
||||
}
|
||||
#endif
|
||||
|
572
Grid_stencil.h
572
Grid_stencil.h
@ -1,11 +1,15 @@
|
||||
#ifndef GRID_STENCIL_H
|
||||
#define GRID_STENCIL_H
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so need
|
||||
// additional stencil support.
|
||||
//
|
||||
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||
// Stencil based code will pre-exchange haloes and use a table lookup for neighbours.
|
||||
// This will be done with generality to allow easier efficient implementations.
|
||||
// Overlap of comms and compute could be semi-automated by tabulating off-node connected,
|
||||
// and
|
||||
//
|
||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||
//
|
||||
@ -35,329 +39,313 @@
|
||||
|
||||
namespace Grid {
|
||||
|
||||
class Stencil {
|
||||
struct CommsRequest {
|
||||
int words;
|
||||
int unified_buffer_offset;
|
||||
int tag;
|
||||
int to_rank;
|
||||
int from_rank;
|
||||
} ;
|
||||
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
public:
|
||||
|
||||
Stencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
std::vector<int> directions,
|
||||
std::vector<int> distances);
|
||||
|
||||
void Stencil_local (int dimension,int shift,int cbmask);
|
||||
void Stencil_comms (int dimension,int shift,int cbmask);
|
||||
void Stencil_comms_simd(int dimension,int shift,int cbmask);
|
||||
|
||||
// Will need to implement actions for
|
||||
Copy_plane;
|
||||
Copy_plane_permute;
|
||||
Gather_plane;
|
||||
|
||||
// The offsets to all neibours in stencil in each direction
|
||||
int _checkerboard;
|
||||
int _npoints; // Move to template param?
|
||||
GridBase * _grid;
|
||||
|
||||
// Store these as SIMD Integer needed
|
||||
//
|
||||
// std::vector< iVector<Integer, Npoint> > _offsets;
|
||||
// std::vector< iVector<Integer, Npoint> > _local;
|
||||
// std::vector< iVector<Integer, Npoint> > _comm_buf_size;
|
||||
// std::vector< iVector<Integer, Npoint> > _permute;
|
||||
|
||||
std::vector<std::vector<int> > _offsets;
|
||||
std::vector<std::vector<int> > _local;
|
||||
|
||||
// npoints of these
|
||||
std::vector<int> _directions;
|
||||
std::vector<int> _distances;
|
||||
std::vector<int> _comm_buf_size;
|
||||
std::vector<int> _permute;
|
||||
std::vector<int> _permute_type;
|
||||
|
||||
};
|
||||
// npoints x Osites() of these
|
||||
std::vector<std::vector<int> > _offsets;
|
||||
std::vector<std::vector<int> > _is_local;
|
||||
std::vector<std::vector<int> > _permute;
|
||||
|
||||
Stencil::Stencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
std::vector<int> directions,
|
||||
std::vector<int> distances){
|
||||
|
||||
_npoints = npoints;
|
||||
_grid = grid;
|
||||
|
||||
for(int i=0;i<npoints;i++){
|
||||
int _unified_buffer_size;
|
||||
int _request_count;
|
||||
|
||||
int dimension = directions[i];
|
||||
int displacement = distances[i];
|
||||
std::vector<CommsRequest> CommsRequests;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances);
|
||||
|
||||
_checkerboard = checkerboard;
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
|
||||
// Can this be avoided with simpler coding of comms?
|
||||
void Local (int point, int dimension,int shift,int cbmask);
|
||||
void Comms (int point, int dimension,int shift,int cbmask);
|
||||
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
|
||||
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
|
||||
|
||||
int sshift[2];
|
||||
// Could allow a functional munging of the halo to another type during the comms.
|
||||
// this could implement the 16bit/32bit/64bit compression.
|
||||
template<class vobj> void HaloExchange(Lattice<vobj> &source,
|
||||
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
|
||||
{
|
||||
// conformable(source._grid,_grid);
|
||||
assert(source._grid==_grid);
|
||||
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
||||
int u_comm_offset=0;
|
||||
|
||||
if ( !comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
// Gather all comms buffers
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Stencil_local(dimension,shift,0x3);
|
||||
} else {
|
||||
Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Stencil_local(dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
} else if ( splice_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
for(int point = 0 ; point < _npoints; point++) {
|
||||
|
||||
printf("Point %d \n",point);fflush(stdout);
|
||||
int dimension = _directions[point];
|
||||
int displacement = _distances[point];
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Stencil_comms_simd(dimension,shift,0x3);
|
||||
} else {
|
||||
Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
} else {
|
||||
// Cshift_comms(ret,rhs,dimension,shift);
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Stencil_comms(dimension,shift,0x3);
|
||||
} else {
|
||||
Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
int shift = (displacement+fd)%fd;
|
||||
|
||||
int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
|
||||
assert (checkerboard== _checkerboard);
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
// Gather phase
|
||||
int sshift [2];
|
||||
if ( comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
if (splice_dim) {
|
||||
printf("splice 0x3 \n");fflush(stdout);
|
||||
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
|
||||
} else {
|
||||
printf("NO splice 0x3 \n");fflush(stdout);
|
||||
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
|
||||
}
|
||||
} else {
|
||||
if(splice_dim){
|
||||
printf("splice 0x1,2 \n");fflush(stdout);
|
||||
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
|
||||
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
|
||||
} else {
|
||||
printf("NO splice 0x1,2 \n");fflush(stdout);
|
||||
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
|
||||
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Stencil::Stencil_local (int dimension,int shift,int cbmask)
|
||||
{
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int gd = _grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
// the permute type
|
||||
int permute_dim =_grid->PermuteDim(dimension);
|
||||
int permute_type=_grid->PermuteType(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
|
||||
int &u_comm_offset)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int o = 0;
|
||||
int bo = x * _grid->_ostride[dimension];
|
||||
GridBase *grid=_grid;
|
||||
assert(rhs._grid==_grid);
|
||||
// conformable(_grid,rhs._grid);
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
|
||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
|
||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
|
||||
|
||||
if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type);
|
||||
else Copy_plane (dimension,x,sx,cbmask);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void Stencil::Stencil_comms (int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=_grid;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||
// FIXME: Do something with buffer_size??
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
if (!offnode) {
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
Copy_plane(dimension,x,sx,cbmask);
|
||||
printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
} else {
|
||||
|
||||
int words = send_buf.size();
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
Gather_plane_simple (dimension,sx,cbmask);
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
/*
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
*/
|
||||
Scatter_plane_simple (dimension,x,cbmask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=_grid;
|
||||
const int Nsimd = _grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=_grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// FIXME do something with buffer size
|
||||
|
||||
std::vector<scalar_type *> pointers(Nsimd); //
|
||||
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
int cb = (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||
std::vector<int> icoor(grid->Nd());
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int comm_any = 0;
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
int shifted_x = x+s*rd+sshift;
|
||||
comm_offnode[s] = shifted_x >= ld;
|
||||
comm_any = comm_any | comm_offnode[s];
|
||||
comm_proc[s] = shifted_x/ld;
|
||||
}
|
||||
|
||||
int o = 0;
|
||||
int bo = x*grid->_ostride[dimension];
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
if ( comm_any ) {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
}
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if (offnode) {
|
||||
|
||||
int s;
|
||||
grid->iCoorFromIindex(icoor,i);
|
||||
s = icoor[dimension];
|
||||
printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
|
||||
int words = send_buf.size();
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
|
||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||
|
||||
printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
if(comm_offnode[s]){
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
|
||||
|
||||
/*
|
||||
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf_extract[i][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
*/
|
||||
|
||||
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
||||
|
||||
} else {
|
||||
|
||||
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
|
||||
|
||||
printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
|
||||
for(int i=0;i<buffer_size;i++){
|
||||
u_comm_buf[u_comm_offset+i]=recv_buf[i];
|
||||
}
|
||||
|
||||
u_comm_offset+=buffer_size;
|
||||
printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
|
||||
}
|
||||
|
||||
// Permute by swizzling pointers in merge
|
||||
int permute_slice=0;
|
||||
int lshift=sshift%ld;
|
||||
int wrap =lshift/rd;
|
||||
int num =lshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
int toggle_bit = (Nsimd>>(permute_type+1));
|
||||
int PermuteMap;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( permute_slice ) {
|
||||
PermuteMap=i^toggle_bit;
|
||||
pointers[i] = rpointers[PermuteMap];
|
||||
} else {
|
||||
pointers[i] = rpointers[i];
|
||||
}
|
||||
}
|
||||
|
||||
Scatter_plane_merge(pointers,dimension,x,cbmask);
|
||||
|
||||
} else {
|
||||
|
||||
int permute_slice=0;
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
template<class vobj>
|
||||
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
|
||||
int &u_comm_offset)
|
||||
{
|
||||
const int Nsimd = _grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=_grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
/* FIXME ALTERNATE BUFFER DETERMINATION */
|
||||
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
|
||||
int bytes = buffer_size*words*sizeof(scalar_type);
|
||||
|
||||
std::vector<scalar_type *> pointers(Nsimd); //
|
||||
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
int cb = (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
std::vector<int> comm_offnode(simd_layout);
|
||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||
std::vector<int> icoor(_grid->Nd());
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int comm_any = 0;
|
||||
for(int s=0;s<simd_layout;s++) {
|
||||
int shifted_x = x+s*rd+sshift;
|
||||
comm_offnode[s] = shifted_x >= ld;
|
||||
comm_any = comm_any | comm_offnode[s];
|
||||
comm_proc[s] = shifted_x/ld;
|
||||
}
|
||||
|
||||
int o = 0;
|
||||
int bo = x*_grid->_ostride[dimension];
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
if ( comm_any ) {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
}
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int s;
|
||||
_grid->iCoorFromIindex(icoor,i);
|
||||
s = icoor[dimension];
|
||||
|
||||
if(comm_offnode[s]){
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
_grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
_grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf_extract[i][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
||||
|
||||
} else {
|
||||
|
||||
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Permute by swizzling pointers in merge
|
||||
int permute_slice=0;
|
||||
int lshift=sshift%ld;
|
||||
int wrap =lshift/rd;
|
||||
int num =lshift%rd;
|
||||
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
|
||||
int toggle_bit = (Nsimd>>(permute_type+1));
|
||||
int PermuteMap;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( permute_slice ) {
|
||||
PermuteMap=i^toggle_bit;
|
||||
pointers[i] = rpointers[PermuteMap];
|
||||
} else {
|
||||
pointers[i] = rpointers[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Here we don't want to scatter, just place into a buffer.
|
||||
for(int i=0;i<buffer_size;i++){
|
||||
merge(u_comm_buf[u_comm_offset+i],pointers);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
258
Grid_stencil_common.cc
Normal file
258
Grid_stencil_common.cc
Normal file
@ -0,0 +1,258 @@
|
||||
#include "Grid.h"
|
||||
|
||||
namespace Grid {
|
||||
|
||||
CartesianStencil::CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances)
|
||||
: _offsets(npoints),
|
||||
_is_local(npoints),
|
||||
_comm_buf_size(npoints),
|
||||
_permute_type(npoints),
|
||||
_permute(npoints)
|
||||
{
|
||||
_npoints = npoints;
|
||||
_grid = grid;
|
||||
_directions = directions;
|
||||
_distances = distances;
|
||||
_unified_buffer_size=0;
|
||||
_request_count =0;
|
||||
CommsRequests.resize(0);
|
||||
|
||||
int osites = _grid->oSites();
|
||||
|
||||
for(int i=0;i<npoints;i++){
|
||||
|
||||
int point = i;
|
||||
|
||||
_offsets[i].resize( osites);
|
||||
_is_local[i].resize(osites);
|
||||
_permute[i].resize( osites);
|
||||
|
||||
int dimension = directions[i];
|
||||
int displacement = distances[i];
|
||||
int shift = displacement;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
_permute_type[point]=_grid->PermuteType(dimension);
|
||||
|
||||
_checkerboard = checkerboard;
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
int sshift[2];
|
||||
|
||||
// Underlying approach. For each local site build
|
||||
// up a table containing the npoint "neighbours" and whether they
|
||||
// live in lattice or a comms buffer.
|
||||
if ( !comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Local(point,dimension,shift,0x3);
|
||||
} else {
|
||||
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
} else { // All permute extract done in comms phase prior to Stencil application
|
||||
// So tables are the same whether comm_dim or splice_dim
|
||||
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
|
||||
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Comms(point,dimension,shift,0x3);
|
||||
} else {
|
||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CartesianStencil::Local (int point, int dimension,int shift,int cbmask)
|
||||
{
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int gd = _grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
shift = (shift+fd)%fd;
|
||||
|
||||
// the permute type
|
||||
int permute_dim =_grid->PermuteDim(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * _grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
|
||||
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
|
||||
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianStencil::Comms (int point,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=_grid;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||
// send to one or more remote nodes.
|
||||
|
||||
int cb= (cbmask==0x2)? 1 : 0;
|
||||
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int offnode = ( x+sshift >= rd );
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = (x+sshift)/rd;
|
||||
|
||||
if (!offnode) {
|
||||
|
||||
int permute_slice=0;
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
CommsRequest cr;
|
||||
|
||||
cr.tag = _request_count++;
|
||||
cr.words = words;
|
||||
cr.unified_buffer_offset = _unified_buffer_size;
|
||||
_unified_buffer_size += words;
|
||||
grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
|
||||
|
||||
CommsRequests.push_back(cr);
|
||||
|
||||
ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_offsets [point][lo+o+b]=ro+o+b;
|
||||
_is_local[point][lo+o+b]=1;
|
||||
_permute [point][lo+o+b]=permute;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
_offsets [point][lo+o+b]=ro+o+b;
|
||||
_is_local[point][lo+o+b]=1;
|
||||
_permute [point][lo+o+b]=permute;
|
||||
}
|
||||
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_offsets [point][so+o+b]=offset+(bo++);
|
||||
_is_local[point][so+o+b]=0;
|
||||
_permute [point][so+o+b]=0;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
_offsets [point][so+o+b]=offset+(bo++);
|
||||
_is_local[point][so+o+b]=0;
|
||||
_permute [point][so+o+b]=0;
|
||||
}
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -194,6 +194,8 @@ namespace Grid {
|
||||
float b= imag(c);
|
||||
vsplat(ret,a,b);
|
||||
}
|
||||
|
||||
|
||||
friend inline void vsplat(vComplexD &ret,double rl,double ig){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_pd(ig,rl,ig,rl);
|
||||
@ -321,13 +323,14 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
{
|
||||
return l*r;
|
||||
}
|
||||
inline vComplex trace(const vComplex &arg){
|
||||
inline vComplexD trace(const vComplexD &arg){
|
||||
return arg;
|
||||
}
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
//// Generic routine to promote object<complex> -> object<vcomplex>
|
||||
//// Supports the array reordering transformation that gives me SIMD utilisation
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
template<template<class> class object>
|
||||
inline object<vComplex> splat(object<Complex >s){
|
||||
object<vComplex> ret;
|
||||
@ -338,6 +341,6 @@ inline object<vComplex> splat(object<Complex >s){
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
|
@ -3,6 +3,16 @@
|
||||
#include "Grid.h"
|
||||
|
||||
namespace Grid {
|
||||
|
||||
/*
|
||||
inline void Print(const char *A,cvec c) {
|
||||
float *fp=(float *)&c;
|
||||
printf(A);
|
||||
printf(" %le %le %le %le %le %le %le %le\n",
|
||||
fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
|
||||
}
|
||||
*/
|
||||
|
||||
class vComplexF {
|
||||
// protected:
|
||||
|
||||
@ -108,7 +118,7 @@ namespace Grid {
|
||||
ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
ret.v= _mm256_addsub_ps(ymm0,ymm1); // FIXME -- AVX2 could MAC
|
||||
ret.v= _mm256_addsub_ps(ymm0,ymm1);
|
||||
#endif
|
||||
#ifdef SSE2
|
||||
cvec ymm0,ymm1,ymm2;
|
||||
@ -142,7 +152,7 @@ namespace Grid {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// FIXME: gonna remove these load/store, get, set, prefetch
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
friend inline void vset(vComplexF &ret, Complex *a){
|
||||
friend inline void vset(vComplexF &ret, ComplexF *a){
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
#endif
|
||||
@ -216,12 +226,12 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
|
||||
#endif
|
||||
}
|
||||
|
||||
friend inline vComplexF operator * (const Complex &a, vComplexF b){
|
||||
friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
|
||||
vComplexF va;
|
||||
vsplat(va,a);
|
||||
return va*b;
|
||||
}
|
||||
friend inline vComplexF operator * (vComplexF b,const Complex &a){
|
||||
friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
|
||||
return a*b;
|
||||
}
|
||||
|
||||
@ -283,22 +293,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
|
||||
}
|
||||
*/
|
||||
|
||||
// NB: Template the following on "type Complex" and then implement *,+,- for
|
||||
// ComplexF, ComplexD, RealF, RealD above to
|
||||
// get full generality of binops with scalars.
|
||||
friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
friend inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const Complex *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
friend inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Conjugate
|
||||
///////////////////////
|
||||
|
||||
friend inline vComplexF conj(const vComplexF &in){
|
||||
vComplexF ret ; vzero(ret);
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
@ -363,10 +362,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
|
||||
|
||||
};
|
||||
|
||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
|
||||
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)
|
||||
{
|
||||
return conj(l)*r;
|
||||
}
|
||||
|
||||
typedef vComplexF vFComplex;
|
||||
typedef vComplexF vComplex;
|
||||
inline void zeroit(vComplexF &z){ vzero(z);}
|
||||
|
||||
inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
|
||||
|
33
Makefile.am
33
Makefile.am
@ -1,11 +1,22 @@
|
||||
# additional include paths necessary to compile the C++ library
|
||||
AM_CXXFLAGS = -I$(top_srcdir)/
|
||||
|
||||
|
||||
extra_sources=
|
||||
if BUILD_COMMS_MPI
|
||||
extra_sources+=Grid_communicator_mpi.cc
|
||||
extra_sources+=Grid_stencil_common.cc
|
||||
endif
|
||||
if BUILD_COMMS_NONE
|
||||
extra_sources+=Grid_communicator_fake.cc
|
||||
extra_sources+=Grid_stencil_common.cc
|
||||
endif
|
||||
|
||||
#
|
||||
# Libraries
|
||||
#
|
||||
lib_LIBRARIES = libGrid.a
|
||||
libGrid_a_SOURCES = Grid_init.cc
|
||||
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
|
||||
|
||||
#
|
||||
# Include files
|
||||
@ -26,24 +37,18 @@ include_HEADERS = Grid_config.h\
|
||||
Grid_cshift_common.h\
|
||||
Grid_cshift_mpi.h\
|
||||
Grid_cshift_none.h\
|
||||
Grid_stencil.h\
|
||||
Grid_math_types.h
|
||||
|
||||
|
||||
#
|
||||
# Test code
|
||||
#
|
||||
bin_PROGRAMS = Grid_main
|
||||
|
||||
extra_sources=
|
||||
if BUILD_COMMS_MPI
|
||||
extra_sources+=Grid_communicator_mpi.cc
|
||||
endif
|
||||
if BUILD_COMMS_NONE
|
||||
extra_sources+=Grid_communicator_fake.cc
|
||||
endif
|
||||
|
||||
Grid_main_SOURCES = \
|
||||
Grid_main.cc\
|
||||
$(extra_sources)
|
||||
bin_PROGRAMS = Grid_main test_Grid_stencil
|
||||
|
||||
Grid_main_SOURCES = Grid_main.cc
|
||||
Grid_main_LDADD = libGrid.a
|
||||
|
||||
test_Grid_stencil_SOURCES = test_Grid_Stencil.cc
|
||||
test_Grid_stencil_LDADD = libGrid.a
|
||||
|
||||
|
53
Makefile.in
53
Makefile.in
@ -88,9 +88,11 @@ POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
bin_PROGRAMS = Grid_main$(EXEEXT)
|
||||
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
|
||||
@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
|
||||
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc \
|
||||
@BUILD_COMMS_MPI_TRUE@ Grid_stencil_common.cc
|
||||
@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc \
|
||||
@BUILD_COMMS_NONE_TRUE@ Grid_stencil_common.cc
|
||||
bin_PROGRAMS = Grid_main$(EXEEXT) test_Grid_stencil$(EXEEXT)
|
||||
subdir = .
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
|
||||
@ -142,18 +144,23 @@ am__v_AR_0 = @echo " AR " $@;
|
||||
am__v_AR_1 =
|
||||
libGrid_a_AR = $(AR) $(ARFLAGS)
|
||||
libGrid_a_LIBADD =
|
||||
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
|
||||
am__libGrid_a_SOURCES_DIST = Grid_init.cc Grid_communicator_mpi.cc \
|
||||
Grid_stencil_common.cc Grid_communicator_fake.cc
|
||||
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT) \
|
||||
@BUILD_COMMS_MPI_TRUE@ Grid_stencil_common.$(OBJEXT)
|
||||
@BUILD_COMMS_NONE_TRUE@am__objects_2 = \
|
||||
@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT) \
|
||||
@BUILD_COMMS_NONE_TRUE@ Grid_stencil_common.$(OBJEXT)
|
||||
am__objects_3 = $(am__objects_1) $(am__objects_2)
|
||||
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT) $(am__objects_3)
|
||||
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
|
||||
PROGRAMS = $(bin_PROGRAMS)
|
||||
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
|
||||
Grid_communicator_fake.cc
|
||||
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
|
||||
@BUILD_COMMS_NONE_TRUE@am__objects_2 = \
|
||||
@BUILD_COMMS_NONE_TRUE@ Grid_communicator_fake.$(OBJEXT)
|
||||
am__objects_3 = $(am__objects_1) $(am__objects_2)
|
||||
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
|
||||
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT)
|
||||
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
|
||||
Grid_main_DEPENDENCIES = libGrid.a
|
||||
am_test_Grid_stencil_OBJECTS = test_Grid_Stencil.$(OBJEXT)
|
||||
test_Grid_stencil_OBJECTS = $(am_test_Grid_stencil_OBJECTS)
|
||||
test_Grid_stencil_DEPENDENCIES = libGrid.a
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
@ -183,8 +190,10 @@ AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
|
||||
am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
|
||||
am__v_CXXLD_0 = @echo " CXXLD " $@;
|
||||
am__v_CXXLD_1 =
|
||||
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
|
||||
DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST)
|
||||
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES) \
|
||||
$(test_Grid_stencil_SOURCES)
|
||||
DIST_SOURCES = $(am__libGrid_a_SOURCES_DIST) $(Grid_main_SOURCES) \
|
||||
$(test_Grid_stencil_SOURCES)
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
@ -329,12 +338,13 @@ top_srcdir = @top_srcdir@
|
||||
|
||||
# additional include paths necessary to compile the C++ library
|
||||
AM_CXXFLAGS = -I$(top_srcdir)/
|
||||
extra_sources = $(am__append_1) $(am__append_2)
|
||||
|
||||
#
|
||||
# Libraries
|
||||
#
|
||||
lib_LIBRARIES = libGrid.a
|
||||
libGrid_a_SOURCES = Grid_init.cc
|
||||
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
|
||||
|
||||
#
|
||||
# Include files
|
||||
@ -355,14 +365,13 @@ include_HEADERS = Grid_config.h\
|
||||
Grid_cshift_common.h\
|
||||
Grid_cshift_mpi.h\
|
||||
Grid_cshift_none.h\
|
||||
Grid_stencil.h\
|
||||
Grid_math_types.h
|
||||
|
||||
extra_sources = $(am__append_1) $(am__append_2)
|
||||
Grid_main_SOURCES = \
|
||||
Grid_main.cc\
|
||||
$(extra_sources)
|
||||
|
||||
Grid_main_SOURCES = Grid_main.cc
|
||||
Grid_main_LDADD = libGrid.a
|
||||
test_Grid_stencil_SOURCES = test_Grid_Stencil.cc
|
||||
test_Grid_stencil_LDADD = libGrid.a
|
||||
all: Grid_config.h
|
||||
$(MAKE) $(AM_MAKEFLAGS) all-am
|
||||
|
||||
@ -499,6 +508,10 @@ Grid_main$(EXEEXT): $(Grid_main_OBJECTS) $(Grid_main_DEPENDENCIES) $(EXTRA_Grid_
|
||||
@rm -f Grid_main$(EXEEXT)
|
||||
$(AM_V_CXXLD)$(CXXLINK) $(Grid_main_OBJECTS) $(Grid_main_LDADD) $(LIBS)
|
||||
|
||||
test_Grid_stencil$(EXEEXT): $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_DEPENDENCIES) $(EXTRA_test_Grid_stencil_DEPENDENCIES)
|
||||
@rm -f test_Grid_stencil$(EXEEXT)
|
||||
$(AM_V_CXXLD)$(CXXLINK) $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_LDADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
@ -509,6 +522,8 @@ distclean-compile:
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_stencil_common.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_Grid_Stencil.Po@am__quote@
|
||||
|
||||
.cc.o:
|
||||
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
|
2
TODO
2
TODO
@ -3,7 +3,7 @@
|
||||
* Replace vset with a call to merge.;
|
||||
* care in Gmerge,Gextract over set.
|
||||
* Const audit
|
||||
* extract / merge extra implementation removal
|
||||
* extract / merge extra implementation removal
|
||||
|
||||
|
||||
* Conditional execution, where etc... -----DONE, simple test
|
||||
|
206
test_Grid_jacobi.cc
Normal file
206
test_Grid_jacobi.cc
Normal file
@ -0,0 +1,206 @@
|
||||
#include "Grid.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
template<class vobj>
|
||||
class LinearOperator {
|
||||
public:
|
||||
operator () (const Lattice<vobj>&src,Lattice<vobj> &result) {};
|
||||
};
|
||||
|
||||
template<class vobj>
|
||||
class LinearOperatorJacobi : public LinearOperator<vobj>
|
||||
{
|
||||
CartesianStencil *Stencil;
|
||||
GridBase *_grid;
|
||||
std::vector<vobj,alignedAllocator<vobj> > comm_buf;
|
||||
LinearOperatorJacobi(GridCartesian *grid)
|
||||
{
|
||||
_grid = grid;
|
||||
int npoint=9;
|
||||
std::vector<int> directions(npoint);
|
||||
std::vector<int> displacements(npoint);
|
||||
for(int mu=0;mu<4;mu++){
|
||||
for(int mp=0;mp<2;mp++){
|
||||
int dir = 2*mu+mp;
|
||||
directions[dir] = mu;
|
||||
displacements[dir]= -1+2*mp;
|
||||
}
|
||||
}
|
||||
directions[8] = 0;
|
||||
displacements[8] = 0;
|
||||
|
||||
Stencil = new CartesianStencil(grid,npoint,0,directions,displacements);
|
||||
|
||||
comm_buf.resize(Stencil->_unified_buffer_size);
|
||||
|
||||
}
|
||||
operator () (const Lattice<vobj>&src,Lattice<vobj> &result)
|
||||
{
|
||||
const int npoint=9;
|
||||
printf("calling halo exchange\n");fflush(stdout);
|
||||
myStencil.HaloExchange(Foo,comm_buf);
|
||||
vobj tmp;
|
||||
vobj
|
||||
for(int i=0;i<_grid->oSites();i++){
|
||||
for(int p=0;p<npoint;p++){
|
||||
|
||||
int offset = Stencil->_offsets [p][i];
|
||||
int local = Stencil->_is_local[p][i];
|
||||
int ptype = Stencil->_permute_type[p];
|
||||
int perm = Stencil->_permute[0][i];
|
||||
|
||||
vobj *nbr;
|
||||
if ( local && perm ){
|
||||
permute(tmp,src._odata[offset],ptype);
|
||||
nbr = &tmp;
|
||||
} else if (local) {
|
||||
nbr = &src._odata[offset];
|
||||
} else {
|
||||
nbr = &comm_buf[offset];
|
||||
}
|
||||
result[i] = result[i]+*nbr;
|
||||
}
|
||||
}
|
||||
}
|
||||
~LinearOperatorJacobi()
|
||||
{
|
||||
delete Stencil;
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::vector<int> latt_size (4);
|
||||
std::vector<int> simd_layout(4);
|
||||
std::vector<int> mpi_layout (4);
|
||||
|
||||
int omp=1;
|
||||
int lat=8;
|
||||
|
||||
mpi_layout[0]=1;
|
||||
mpi_layout[1]=2;
|
||||
mpi_layout[2]=1;
|
||||
mpi_layout[3]=1;
|
||||
|
||||
latt_size[0] = lat;
|
||||
latt_size[1] = lat;
|
||||
latt_size[2] = lat;
|
||||
latt_size[3] = lat;
|
||||
double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||
|
||||
#ifdef AVX512
|
||||
simd_layout[0] = 1;
|
||||
simd_layout[1] = 2;
|
||||
simd_layout[2] = 2;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
simd_layout[0] = 1;
|
||||
simd_layout[1] = 1;
|
||||
simd_layout[2] = 2;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
#if defined (SSE2)
|
||||
simd_layout[0] = 1;
|
||||
simd_layout[1] = 1;
|
||||
simd_layout[2] = 1;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
|
||||
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
|
||||
|
||||
LatticeColourMatrix Foo(&Fine);
|
||||
LatticeColourMatrix Bar(&Fine);
|
||||
LatticeColourMatrix Check(&Fine);
|
||||
LatticeColourMatrix Diff(&Fine);
|
||||
|
||||
random(Foo);
|
||||
gaussian(Bar);
|
||||
|
||||
|
||||
for(int dir=0;dir<4;dir++){
|
||||
for(int disp=0;disp<Fine._rdimensions[dir];disp++){
|
||||
|
||||
// start to test the Cartesian npoint stencil infrastructure
|
||||
int npoint=;
|
||||
std::vector<int> directions(npoint,dir);
|
||||
std::vector<int> displacements(npoint,disp);
|
||||
|
||||
CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
|
||||
|
||||
printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
|
||||
std::vector<int> ocoor(4);
|
||||
for(int o=0;o<Fine.oSites();o++){
|
||||
Fine.oCoorFromOindex(ocoor,o);
|
||||
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
|
||||
int nbr = Fine.oIndexReduced(ocoor);
|
||||
int stcl= myStencil._offsets[0][o];
|
||||
if(nbr!=stcl){
|
||||
printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
|
||||
}
|
||||
}
|
||||
|
||||
printf("allocating %d buffers\n",myStencil._unified_buffer_size);
|
||||
fflush(stdout);
|
||||
std::vector<vColourMatrix,alignedAllocator<vColourMatrix> > comm_buf(myStencil._unified_buffer_size);
|
||||
printf("calling halo exchange\n");fflush(stdout);
|
||||
myStencil.HaloExchange(Foo,comm_buf);
|
||||
|
||||
Bar = Cshift(Foo,dir,disp);
|
||||
|
||||
// Implement a stencil code that should agree with cshift!
|
||||
for(int i=0;i<Check._grid->oSites();i++){
|
||||
|
||||
int offset = myStencil._offsets [0][i];
|
||||
int local = myStencil._is_local[0][i];
|
||||
int permute_type = myStencil._permute_type[0];
|
||||
int perm =myStencil._permute[0][i];
|
||||
if ( local && perm )
|
||||
permute(Check._odata[i],Foo._odata[offset],permute_type);
|
||||
else if (local)
|
||||
Check._odata[i] = Foo._odata[offset];
|
||||
else
|
||||
Check._odata[i] = comm_buf[offset];
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
std::vector<int> coor(4);
|
||||
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
|
||||
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
|
||||
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
|
||||
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
|
||||
|
||||
Complex diff;
|
||||
ColourMatrix check,bar;
|
||||
peekSite(check,Check,coor);
|
||||
peekSite(bar,Bar,coor);
|
||||
|
||||
for(int r=0;r<3;r++){
|
||||
for(int c=0;c<3;c++){
|
||||
diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
|
||||
double nn=real(conj(diff)*diff);
|
||||
if ( nn > 0 ){
|
||||
printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
|
||||
coor[0],coor[1],coor[2],coor[3],r,c,
|
||||
nn,
|
||||
real(check._internal._internal[r][c]),
|
||||
real(bar._internal._internal[r][c])
|
||||
);
|
||||
}
|
||||
}}
|
||||
|
||||
}}}}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
154
test_Grid_stencil.cc
Normal file
154
test_Grid_stencil.cc
Normal file
@ -0,0 +1,154 @@
|
||||
#include "Grid.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::vector<int> latt_size (4);
|
||||
std::vector<int> simd_layout(4);
|
||||
std::vector<int> mpi_layout (4);
|
||||
|
||||
int omp=1;
|
||||
int lat=8;
|
||||
|
||||
mpi_layout[0]=1;
|
||||
mpi_layout[1]=1;
|
||||
mpi_layout[2]=1;
|
||||
mpi_layout[3]=1;
|
||||
|
||||
latt_size[0] = lat;
|
||||
latt_size[1] = lat;
|
||||
latt_size[2] = lat;
|
||||
latt_size[3] = lat;
|
||||
double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||
|
||||
#ifdef AVX512
|
||||
simd_layout[0] = 1;
|
||||
simd_layout[1] = 1;
|
||||
simd_layout[2] = 2;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
simd_layout[0] = 1;
|
||||
simd_layout[1] = 1;
|
||||
simd_layout[2] = 1;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
#if defined (SSE2)
|
||||
simd_layout[0] = 1;
|
||||
simd_layout[1] = 1;
|
||||
simd_layout[2] = 1;
|
||||
simd_layout[3] = 2;
|
||||
#endif
|
||||
|
||||
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
|
||||
|
||||
LatticeColourMatrix Foo(&Fine);
|
||||
LatticeColourMatrix Bar(&Fine);
|
||||
LatticeColourMatrix Check(&Fine);
|
||||
LatticeColourMatrix Diff(&Fine);
|
||||
|
||||
random(Foo);
|
||||
gaussian(Bar);
|
||||
|
||||
|
||||
for(int dir=0;dir<4;dir++){
|
||||
for(int disp=0;disp<Fine._rdimensions[dir];disp++){
|
||||
|
||||
// start to test the Cartesian npoint stencil infrastructure
|
||||
int npoint=1;
|
||||
std::vector<int> directions(npoint,dir);
|
||||
std::vector<int> displacements(npoint,disp);
|
||||
|
||||
CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
|
||||
|
||||
printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
|
||||
std::vector<int> ocoor(4);
|
||||
for(int o=0;o<Fine.oSites();o++){
|
||||
Fine.oCoorFromOindex(ocoor,o);
|
||||
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
|
||||
int nbr = Fine.oIndexReduced(ocoor);
|
||||
int stcl= myStencil._offsets[0][o];
|
||||
if(nbr!=stcl){
|
||||
printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
|
||||
}
|
||||
}
|
||||
|
||||
printf("allocating %d buffers\n",myStencil._unified_buffer_size);
|
||||
fflush(stdout);
|
||||
std::vector<vColourMatrix,alignedAllocator<vColourMatrix> > comm_buf(myStencil._unified_buffer_size);
|
||||
printf("calling halo exchange\n");fflush(stdout);
|
||||
myStencil.HaloExchange(Foo,comm_buf);
|
||||
|
||||
Bar = Cshift(Foo,dir,disp);
|
||||
|
||||
// Implement a stencil code that should agree with cshift!
|
||||
for(int i=0;i<Check._grid->oSites();i++){
|
||||
|
||||
int offset = myStencil._offsets [0][i];
|
||||
int local = myStencil._is_local[0][i];
|
||||
int permute_type = myStencil._permute_type[0];
|
||||
int perm =myStencil._permute[0][i];
|
||||
if ( local && perm )
|
||||
permute(Check._odata[i],Foo._odata[offset],permute_type);
|
||||
else if (local)
|
||||
Check._odata[i] = Foo._odata[offset];
|
||||
else
|
||||
Check._odata[i] = comm_buf[offset];
|
||||
|
||||
|
||||
}
|
||||
|
||||
Real nrmC = norm2(Check);
|
||||
Real nrmB = norm2(Bar);
|
||||
Real nrm = norm2(Check-Bar);
|
||||
printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout);
|
||||
|
||||
Real snrmC =0;
|
||||
Real snrmB =0;
|
||||
Real snrm =0;
|
||||
|
||||
std::vector<int> coor(4);
|
||||
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
|
||||
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
|
||||
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
|
||||
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
|
||||
|
||||
Complex diff;
|
||||
ColourMatrix check,bar;
|
||||
peekSite(check,Check,coor);
|
||||
peekSite(bar,Bar,coor);
|
||||
|
||||
for(int r=0;r<3;r++){
|
||||
for(int c=0;c<3;c++){
|
||||
diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
|
||||
double nn=real(conj(diff)*diff);
|
||||
if ( nn > 0){
|
||||
printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
|
||||
coor[0],coor[1],coor[2],coor[3],r,c,
|
||||
nn,
|
||||
real(check._internal._internal[r][c]),
|
||||
real(bar._internal._internal[r][c])
|
||||
);
|
||||
}
|
||||
snrmC=snrmC+real(conj(check._internal._internal[r][c])*check._internal._internal[r][c]);
|
||||
snrmB=snrmB+real(conj(bar._internal._internal[r][c])*bar._internal._internal[r][c]);
|
||||
snrm=snrm+nn;
|
||||
}}
|
||||
|
||||
}}}}
|
||||
|
||||
printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
Loading…
Reference in New Issue
Block a user