mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Clean up but no major changes
This commit is contained in:
parent
d081715504
commit
d5eee231e0
@ -201,10 +201,8 @@ public:
|
|||||||
block = block*_rdimensions[d];
|
block = block*_rdimensions[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( _isites != vComplex::Nsimd()) {
|
assert( _isites == vComplex::Nsimd());
|
||||||
printf("bad layout for grid isites %d Nsimd %d\n",_isites,vComplex::Nsimd());
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -235,10 +233,8 @@ public:
|
|||||||
int ocb=CheckerBoardFromOsite(osite);
|
int ocb=CheckerBoardFromOsite(osite);
|
||||||
|
|
||||||
if ( (source_cb+ocb)&1 ) {
|
if ( (source_cb+ocb)&1 ) {
|
||||||
printf("Checkerboard shift %d\n",(shift)/2);
|
|
||||||
return (shift)/2;
|
return (shift)/2;
|
||||||
} else {
|
} else {
|
||||||
printf("Checkerboard shift %d\n",(shift+1)/2);
|
|
||||||
return (shift+1)/2;
|
return (shift+1)/2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -314,10 +310,7 @@ public:
|
|||||||
block = block*_rdimensions[d];
|
block = block*_rdimensions[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( _isites != vComplex::Nsimd()) {
|
assert ( _isites == vComplex::Nsimd());
|
||||||
printf("bad layout for grid isites %d Nsimd %d\n",_isites,vComplex::Nsimd());
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
protected:
|
protected:
|
||||||
virtual int oIndex(std::vector<int> &coor)
|
virtual int oIndex(std::vector<int> &coor)
|
||||||
|
@ -30,9 +30,7 @@ public:
|
|||||||
|
|
||||||
Lattice(SimdGrid *grid) : _grid(grid) {
|
Lattice(SimdGrid *grid) : _grid(grid) {
|
||||||
_odata.reserve(_grid->oSites());
|
_odata.reserve(_grid->oSites());
|
||||||
if ( ((uint64_t)&_odata[0])&0xF) {
|
assert((((uint64_t)&_odata[0])&0xF) ==0);
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
checkerboard=0;
|
checkerboard=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,23 +95,22 @@ public:
|
|||||||
template<class sobj>
|
template<class sobj>
|
||||||
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
|
||||||
|
|
||||||
if ( l.checkerboard != l._grid->CheckerBoard(site)){
|
typedef typename vobj::scalar_type stype;
|
||||||
printf("Poking wrong checkerboard\n");
|
typedef typename vobj::vector_type vtype;
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
assert( l.checkerboard == l._grid->CheckerBoard(site));
|
||||||
|
|
||||||
int o_index = l._grid->oIndex(site);
|
int o_index = l._grid->oIndex(site);
|
||||||
int i_index = l._grid->iIndex(site);
|
int i_index = l._grid->iIndex(site);
|
||||||
|
|
||||||
// BUGGY. This assumes complex real
|
stype *v_ptr = (stype *)&l._odata[o_index];
|
||||||
Real *v_ptr = (Real *)&l._odata[o_index];
|
stype *s_ptr = (stype *)&s;
|
||||||
Real *s_ptr = (Real *)&s;
|
|
||||||
v_ptr = v_ptr + 2*i_index;
|
v_ptr = v_ptr + 2*i_index;
|
||||||
|
|
||||||
for(int i=0;i<sizeof(sobj);i+=2*sizeof(Real)){
|
for(int i=0;i<sizeof(sobj);i+=2*sizeof(stype)){
|
||||||
v_ptr[0] = s_ptr[0];
|
v_ptr[0] = s_ptr[0];
|
||||||
v_ptr[1] = s_ptr[1];
|
v_ptr[1] = s_ptr[1];
|
||||||
v_ptr+=2*vComplex::Nsimd();
|
v_ptr+=2*vtype::Nsimd();
|
||||||
s_ptr+=2;
|
s_ptr+=2;
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@ -124,22 +121,22 @@ public:
|
|||||||
template<class sobj>
|
template<class sobj>
|
||||||
friend void peekSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
|
friend void peekSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
|
||||||
|
|
||||||
// FIXME : define exceptions set and throw up.
|
typedef typename vobj::scalar_type stype;
|
||||||
if ( l.checkerboard != l._grid->CheckerBoard(site)){
|
typedef typename vobj::vector_type vtype;
|
||||||
printf("Peeking wrong checkerboard\n");
|
|
||||||
exit(EXIT_FAILURE);
|
assert( l.checkerboard== l._grid->CheckerBoard(site));
|
||||||
}
|
|
||||||
int o_index = l._grid->oIndex(site);
|
int o_index = l._grid->oIndex(site);
|
||||||
int i_index = l._grid->iIndex(site);
|
int i_index = l._grid->iIndex(site);
|
||||||
|
|
||||||
Real *v_ptr = (Real *)&l._odata[o_index];
|
stype *v_ptr = (stype *)&l._odata[o_index];
|
||||||
Real *s_ptr = (Real *)&s;
|
stype *s_ptr = (stype *)&s;
|
||||||
v_ptr = v_ptr + 2*i_index;
|
v_ptr = v_ptr + 2*i_index;
|
||||||
|
|
||||||
for(int i=0;i<sizeof(sobj);i+=2*sizeof(Real)){
|
for(int i=0;i<sizeof(sobj);i+=2*sizeof(stype)){
|
||||||
s_ptr[0] = v_ptr[0];
|
s_ptr[0] = v_ptr[0];
|
||||||
s_ptr[1] = v_ptr[1];
|
s_ptr[1] = v_ptr[1];
|
||||||
v_ptr+=2*vComplex::Nsimd();
|
v_ptr+=2*vtype::Nsimd();
|
||||||
s_ptr+=2;
|
s_ptr+=2;
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@ -291,15 +288,6 @@ public:
|
|||||||
}
|
}
|
||||||
}; // class Lattice
|
}; // class Lattice
|
||||||
|
|
||||||
/* Need to implement the multiplication return type matching S S -> S, S M -> M, M S -> M through
|
|
||||||
all nested possibilities.
|
|
||||||
template<template<class> class lhs,template<class> class rhs>
|
|
||||||
class MultTypeSelector {
|
|
||||||
template<typename vtype> using ltype = lhs
|
|
||||||
typedef lhs type;
|
|
||||||
};
|
|
||||||
*/
|
|
||||||
|
|
||||||
template<class obj1,class obj2>
|
template<class obj1,class obj2>
|
||||||
void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
|
void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
|
||||||
{
|
{
|
||||||
@ -313,28 +301,6 @@ public:
|
|||||||
uint32_t vec_len = lhs._grid->oSites();
|
uint32_t vec_len = lhs._grid->oSites();
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(int ss=0;ss<vec_len;ss++){
|
for(int ss=0;ss<vec_len;ss++){
|
||||||
|
|
||||||
const char * ptr =(const char*)&lhs._odata[ss];
|
|
||||||
#ifdef PREFETCH
|
|
||||||
v_prefetch0(sizeof(obj2), ptr);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int i=0;i<sizeof(obj2);i+=64){
|
|
||||||
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
|
|
||||||
_mm_prefetch(ptr+i+256,_MM_HINT_T0);
|
|
||||||
}
|
|
||||||
|
|
||||||
ptr =(const char*)&rhs._odata[ss];
|
|
||||||
#ifdef PREFETCH
|
|
||||||
v_prefetch0(sizeof(obj3), ptr);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int i=0;i<sizeof(obj3);i+=64){
|
|
||||||
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
|
|
||||||
_mm_prefetch(ptr+i+256,_MM_HINT_T0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
|
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,17 @@
|
|||||||
#ifndef _GRID_CSHIFT_COMMON_H_
|
#ifndef _GRID_CSHIFT_COMMON_H_
|
||||||
#define _GRID_CSHIFT_COMMON_H_
|
#define _GRID_CSHIFT_COMMON_H_
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Must not lose sight of the goal: to be able to construct really efficient
|
||||||
|
// gather-to-a-point stencil code. CSHIFT is not the best way, so we probably need
|
||||||
|
// additional stencil support.
|
||||||
|
//
|
||||||
|
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
||||||
|
//
|
||||||
|
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
||||||
|
//
|
||||||
|
// Grid could create a neighbour index table for a given stencil.
|
||||||
|
// Could also implement CovariantCshift.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
@ -8,8 +20,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
|||||||
{
|
{
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
|
||||||
// printf("Gather plane _simple mask %d\n",cbmask);
|
|
||||||
|
|
||||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||||
|
|
||||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
@ -31,7 +41,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
|||||||
int o = 0; // relative offset to base within plane
|
int o = 0; // relative offset to base within plane
|
||||||
int bo = 0; // offset in buffer
|
int bo = 0; // offset in buffer
|
||||||
|
|
||||||
// int jjj=0;
|
|
||||||
#pragma omp parallel for collapse(2)
|
#pragma omp parallel for collapse(2)
|
||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
@ -39,11 +48,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
|
|||||||
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
buffer[bo]=rhs._odata[so+o+b];
|
buffer[bo]=rhs._odata[so+o+b];
|
||||||
// float * ptr = (float *)& rhs._odata[so+o+b];
|
|
||||||
// if( (cbmask!=3)&&(jjj<8)){
|
|
||||||
// printf("Gather_plane_simple %d %le bo %d\n",so+o+b,*ptr,bo);
|
|
||||||
// jjj++;
|
|
||||||
// }
|
|
||||||
bo++;
|
bo++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -215,7 +219,7 @@ friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int
|
|||||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||||
int o = 0; // relative offset to base within plane
|
int o = 0; // relative offset to base within plane
|
||||||
// int jjj=0;
|
|
||||||
#pragma omp parallel for collapse(2)
|
#pragma omp parallel for collapse(2)
|
||||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||||
@ -224,11 +228,6 @@ friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int
|
|||||||
|
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||||
// float *ptr =(float *) &rhs._odata[ro+o+b];
|
|
||||||
// if((cbmask!=0x3)&&jjj<8) {
|
|
||||||
// printf("Copy_plane %d %le n,b=%d,%d mask %d ocb %d\n",ro+o+b,*ptr,n,b,cbmask,ocb);
|
|
||||||
// jjj++;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,36 +1,10 @@
|
|||||||
#ifndef _GRID_MPI_CSHIFT_H_
|
#ifndef _GRID_MPI_CSHIFT_H_
|
||||||
#define _GRID_MPI_CSHIFT_H_
|
#define _GRID_MPI_CSHIFT_H_
|
||||||
|
|
||||||
|
#ifndef MAX
|
||||||
#define MAX(x,y) ((x)>(y)?(x):(y))
|
#define MAX(x,y) ((x)>(y)?(x):(y))
|
||||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
#endif
|
||||||
// Must not lose sight that goal is to be able to construct really efficient
|
|
||||||
// gather to a point stencil code. CSHIFT is not the best way, so probably need
|
|
||||||
// additional stencil support.
|
|
||||||
//
|
|
||||||
// Could still do a templated syntax tree and make CSHIFT return lattice vector.
|
|
||||||
//
|
|
||||||
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
|
|
||||||
//
|
|
||||||
// Lattice <foo> could also allocate haloes which get used for stencil code.
|
|
||||||
//
|
|
||||||
// Grid could create a neighbour index table for a given stencil.
|
|
||||||
// Could also implement CovariantCshift.
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
// Q. Further split this into separate sub functions?
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
// CshiftCB_local
|
|
||||||
// CshiftCB_local_permute
|
|
||||||
|
|
||||||
// Cshift_comms_splice
|
|
||||||
// Cshift_comms
|
|
||||||
// Cshift_local
|
|
||||||
// Cshift_local_permute
|
|
||||||
|
|
||||||
|
|
||||||
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
{
|
{
|
||||||
@ -71,16 +45,10 @@ friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
|
||||||
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
// printf("Cshift_comms : single pass\n");
|
|
||||||
Cshift_comms(ret,rhs,dimension,shift,0x3);
|
Cshift_comms(ret,rhs,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
// printf("Cshift_comms : two pass\n");
|
|
||||||
// printf("call1\n");
|
|
||||||
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
// printf("call2\n");
|
|
||||||
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
// printf("done\n");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,11 +62,8 @@ friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimensio
|
|||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
// printf("call1 0x1 cb=even\n");
|
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
// printf("call2 0x2 cb=odd\n");
|
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
// printf("done\n");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,13 +85,10 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
// Packed gather sequence is clean
|
|
||||||
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
|
||||||
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
|
||||||
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
|
||||||
|
|
||||||
// This code could be simplified by multiple calls to single routine with extra params to
|
|
||||||
// encapsulate the difference in the code paths.
|
|
||||||
int cb= (cbmask==0x2)? 1 : 0;
|
int cb= (cbmask==0x2)? 1 : 0;
|
||||||
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||||
|
|
||||||
@ -137,8 +99,9 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
int comm_proc = (x+sshift)/rd;
|
int comm_proc = (x+sshift)/rd;
|
||||||
|
|
||||||
if (!offnode) {
|
if (!offnode) {
|
||||||
// printf("local x %d sshift %d offnode %d rd %d cb %d\n",x,sshift,offnode,rd,cb);
|
|
||||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
int words = send_buf.size();
|
int words = send_buf.size();
|
||||||
@ -146,29 +109,13 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
|
|||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
// printf("nonlocal x %d sx %d sshift %d offnode %d rd %d cb %d cbmask %d rhscb %d comm_proc %d\n",
|
|
||||||
// x,sx,sshift,offnode,rd,cb,cbmask,rhs.checkerboard,comm_proc);
|
|
||||||
// Copy_plane(temp,rhs,dimension,x,sx,cbmask);
|
|
||||||
|
|
||||||
// Bug found; cbmask may differ between sx plan and rx plane.
|
|
||||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
|
||||||
// for(int i=0;i<MIN(words,8);i++){
|
|
||||||
// float *ptr = (float *)&send_buf[i];
|
|
||||||
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
|
|
||||||
// }
|
|
||||||
// Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask^0x3);
|
|
||||||
// for(int i=0;i<MIN(words,8);i++){
|
|
||||||
// float *ptr = (float *)&send_buf[i];
|
|
||||||
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
|
|
||||||
// }
|
|
||||||
// recv_buf=send_buf;
|
|
||||||
|
|
||||||
int rank = grid->_processor;
|
int rank = grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
// printf("bytes %d node %d sending to %d receiving from %d\n",bytes,rank,xmit_to_rank,recv_from_rank );
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
@ -224,46 +171,29 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
int cb = (cbmask==0x2)? 1 : 0;
|
int cb = (cbmask==0x2)? 1 : 0;
|
||||||
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
|
||||||
|
|
||||||
// printf("cshift-comms-simd: shift = %d ; sshift = %d ; cbmask %d ; simd_layout %d\n",shift,sshift,cbmask,simd_layout);
|
|
||||||
std::vector<int> comm_offnode(simd_layout);
|
std::vector<int> comm_offnode(simd_layout);
|
||||||
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
|
||||||
|
|
||||||
// Strategy
|
|
||||||
//
|
|
||||||
//* Loop over source planes
|
|
||||||
//* if any communication needed extract and send
|
|
||||||
//* if communication needed extract and send
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
int comm_any = 0;
|
int comm_any = 0;
|
||||||
for(int s=0;s<simd_layout;s++) {
|
for(int s=0;s<simd_layout;s++) {
|
||||||
// does shift to "neighbour" takes us off node?
|
|
||||||
// coordinates (reduce plane, simd_lane) of neighbour?
|
|
||||||
// how many nodes away is this shift?
|
|
||||||
// where we should send to?
|
|
||||||
// where we should receive from?
|
|
||||||
int shifted_x = x+s*rd+sshift;
|
int shifted_x = x+s*rd+sshift;
|
||||||
comm_offnode[s] = shifted_x >= ld;
|
comm_offnode[s] = shifted_x >= ld;
|
||||||
comm_any = comm_any | comm_offnode[s];
|
comm_any = comm_any | comm_offnode[s];
|
||||||
comm_proc[s] = shifted_x/ld;
|
comm_proc[s] = shifted_x/ld;
|
||||||
// printf("rd %d x %d shifted %d s=%d comm_any %d\n",rd, x,shifted_x,s,comm_any);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int o = 0;
|
int o = 0;
|
||||||
int bo = x*grid->_ostride[dimension];
|
int bo = x*grid->_ostride[dimension];
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
// Need Convenience function in _grid. Move this in
|
|
||||||
if ( comm_any ) {
|
if ( comm_any ) {
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||||
}
|
}
|
||||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
// for(int i=0;i<Nsimd;i++){
|
|
||||||
// printf("extracted %d %le\n",i,real(send_buf_extract[i][0]));
|
|
||||||
// }
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
@ -283,14 +213,11 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
// printf("Cshift_simd comms %d %le %le\n",i,real(recv_buf_extract[i][0]),real(send_buf_extract[i][0]));
|
|
||||||
|
|
||||||
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
|
||||||
// printf("Cshift_simd local %d %le \n",i,real(send_buf_extract[i][0]));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -311,7 +238,6 @@ friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
|
|||||||
} else {
|
} else {
|
||||||
pointers[i] = rpointers[i];
|
pointers[i] = rpointers[i];
|
||||||
}
|
}
|
||||||
// printf("Cshift_simd perm %d num %d wrap %d swiz %d %le unswiz %le\n",permute_slice,num,wrap,i,real(pointers[i][0]),real(rpointers[i][0]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
|
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
|
||||||
|
@ -830,7 +830,7 @@ template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &
|
|||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// Can only take the real/imag part of scalar objects, since
|
// Can only take the real/imag part of scalar objects, since
|
||||||
// lattice objects of different complexity are non-conformable.
|
// lattice objects of different complex nature are non-conformable.
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
|
template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
|
||||||
{
|
{
|
||||||
|
22
Grid_mpi.cc
22
Grid_mpi.cc
@ -17,13 +17,15 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
|
|||||||
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||||
MPI_Comm_rank(communicator,&_processor);
|
MPI_Comm_rank(communicator,&_processor);
|
||||||
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||||
printf("Hello world from processor [");
|
|
||||||
for(int i=0;i<_ndimension;i++){
|
for(int i=0;i<_ndimension;i++){
|
||||||
printf("%d ",_processor_coor[i]);
|
|
||||||
_Nprocessors*=_processors[i];
|
_Nprocessors*=_processors[i];
|
||||||
}
|
}
|
||||||
printf("]\n");
|
|
||||||
fflush(stdout);
|
int Size;
|
||||||
|
MPI_Comm_size(communicator,&Size);
|
||||||
|
|
||||||
|
assert(Size==_Nprocessors);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::GlobalSumF(float &f){
|
void CartesianCommunicator::GlobalSumF(float &f){
|
||||||
@ -71,15 +73,3 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
|
|
||||||
// Could possibly do a direct block strided send?
|
|
||||||
int MPI_Type_vector(
|
|
||||||
int count,
|
|
||||||
int blocklength,
|
|
||||||
int stride,
|
|
||||||
MPI_Datatype old_type,
|
|
||||||
MPI_Datatype *newtype_p
|
|
||||||
);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
@ -48,7 +48,6 @@ namespace Grid {
|
|||||||
#endif
|
#endif
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
ret.v = _mm512_add_pd(a.v,b.v);
|
ret.v = _mm512_add_pd(a.v,b.v);
|
||||||
//printf("%s %f\n",__func__,_mm512_reduce_mul_pd(ret.v));
|
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
ret.v = vec_add(a.v,b.v);
|
ret.v = vec_add(a.v,b.v);
|
||||||
@ -210,7 +209,7 @@ namespace Grid {
|
|||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
#error // Not implemented yet
|
#error // Not implemented yet
|
||||||
#endif
|
#endif
|
||||||
default: exit(EXIT_FAILURE); break;
|
default: assert(0); break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
void vload(zvec& a){
|
void vload(zvec& a){
|
||||||
@ -265,8 +264,7 @@ friend inline void vstore(vComplexD &ret, ComplexD *a){
|
|||||||
//Note v has a3 a2 a1 a0
|
//Note v has a3 a2 a1 a0
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
printf("%s Not implemented\n",__func__);
|
assert(0);
|
||||||
exit(-1);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
friend inline void vprefetch(const vComplexD &v)
|
friend inline void vprefetch(const vComplexD &v)
|
||||||
@ -294,7 +292,7 @@ friend inline void vstore(vComplexD &ret, ComplexD *a){
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
exit(0); // not implemented
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -193,7 +193,7 @@ namespace Grid {
|
|||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
#error
|
#error
|
||||||
#endif
|
#endif
|
||||||
default: exit(EXIT_FAILURE); break;
|
default: assert(0); break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -235,8 +235,7 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
|
|||||||
//Note v has a3 a2 a1 a0
|
//Note v has a3 a2 a1 a0
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
printf("%s Not implemented\n",__func__);
|
assert(0);
|
||||||
exit(-1);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
friend inline void vprefetch(const vComplexF &v)
|
friend inline void vprefetch(const vComplexF &v)
|
||||||
@ -333,7 +332,7 @@ exit(-1);
|
|||||||
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag
|
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
exit(0); // not implemented
|
assert(0);
|
||||||
#endif
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -161,7 +161,7 @@ namespace Grid {
|
|||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
#error
|
#error
|
||||||
#endif
|
#endif
|
||||||
default: exit(EXIT_FAILURE); break;
|
default: assert(0);break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
// gona be bye bye
|
// gona be bye bye
|
||||||
@ -214,8 +214,7 @@ namespace Grid {
|
|||||||
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
|
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
printf("%s Not implemented\n",__func__);
|
assert(0);
|
||||||
exit(-1);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
friend inline void vprefetch(const vRealD &v)
|
friend inline void vprefetch(const vRealD &v)
|
||||||
|
@ -185,7 +185,7 @@ namespace Grid {
|
|||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
#error not implemented
|
#error not implemented
|
||||||
#endif
|
#endif
|
||||||
default: exit(EXIT_FAILURE); break;
|
default: assert(0); break;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -236,8 +236,7 @@ friend inline void vstore(vRealF &ret, float *a){
|
|||||||
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
|
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
|
||||||
#endif
|
#endif
|
||||||
#ifdef QPX
|
#ifdef QPX
|
||||||
printf("%s Not implemented\n",__func__);
|
assert(0);
|
||||||
exit(-1);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user