mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Merge branch 'master' of github.com:paboyle/Grid
Conflicts: lib/qcd/action/fermion/WilsonFermion5D.cc
This commit is contained in:
commit
899ca41cb8
@ -93,12 +93,13 @@ int main (int argc, char ** argv)
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=1344*volume*ncall;
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
Dw.Report();
|
||||
}
|
||||
|
||||
|
||||
|
@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
|
||||
AC_CHECK_HEADERS(malloc/malloc.h)
|
||||
AC_CHECK_HEADERS(malloc.h)
|
||||
AC_CHECK_HEADERS(endian.h)
|
||||
AC_CHECK_HEADERS(execinfo.h)
|
||||
AC_CHECK_HEADERS(gmp.h)
|
||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
||||
|
11
lib/Init.cc
11
lib/Init.cc
@ -16,8 +16,9 @@
|
||||
#include <iterator>
|
||||
|
||||
#define __X86_64
|
||||
#define EXECINFO
|
||||
#ifdef EXECINFO
|
||||
|
||||
|
||||
#ifdef HAVE_EXECINFO_H
|
||||
#include <execinfo.h>
|
||||
#endif
|
||||
|
||||
@ -233,7 +234,9 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
printf(" mem address %llx\n",(unsigned long long)si->si_addr);
|
||||
printf(" code %d\n",si->si_code);
|
||||
|
||||
#ifdef __X86_64
|
||||
// Linux/Posix
|
||||
#ifdef __linux__
|
||||
// And x86 64bit
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||
printf(" instruction %llx\n",(unsigned long long)sc->rip);
|
||||
@ -259,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
REG(r14);
|
||||
REG(r15);
|
||||
#endif
|
||||
#ifdef EXECINFO
|
||||
#ifdef HAVE_EXECINFO_H
|
||||
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
|
||||
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
|
||||
for (int i = 0; i < symbols; i++){
|
||||
|
392
lib/Stencil.h
392
lib/Stencil.h
@ -48,10 +48,14 @@ namespace Grid {
|
||||
int _around_the_world;
|
||||
};
|
||||
|
||||
template<class vobj,class cobj, class compressor>
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
public:
|
||||
|
||||
typedef uint32_t StencilInteger;
|
||||
typedef typename cobj::vector_type vector_type;
|
||||
typedef typename cobj::scalar_type scalar_type;
|
||||
typedef typename cobj::scalar_object scalar_object;
|
||||
|
||||
int _checkerboard;
|
||||
int _npoints; // Move to template param?
|
||||
@ -66,33 +70,334 @@ namespace Grid {
|
||||
// npoints x Osites() of these
|
||||
std::vector<std::vector<StencilEntry> > _entries;
|
||||
|
||||
// Comms buffers
|
||||
std::vector<std::vector<scalar_object> > send_buf_extract;
|
||||
std::vector<std::vector<scalar_object> > recv_buf_extract;
|
||||
std::vector<scalar_object *> pointers;
|
||||
std::vector<scalar_object *> rpointers;
|
||||
Vector<cobj> send_buf;
|
||||
|
||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
||||
|
||||
int _unified_buffer_size;
|
||||
int _request_count;
|
||||
|
||||
double buftime;
|
||||
double gathertime;
|
||||
double commtime;
|
||||
double commstime;
|
||||
double halotime;
|
||||
double scattertime;
|
||||
double mergetime;
|
||||
double gathermtime;
|
||||
double splicetime;
|
||||
double nosplicetime;
|
||||
|
||||
CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances);
|
||||
|
||||
|
||||
|
||||
CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances)
|
||||
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
||||
{
|
||||
gathertime=0;
|
||||
commtime=0;
|
||||
commstime=0;
|
||||
halotime=0;
|
||||
scattertime=0;
|
||||
mergetime=0;
|
||||
gathermtime=0;
|
||||
buftime=0;
|
||||
splicetime=0;
|
||||
nosplicetime=0;
|
||||
|
||||
_npoints = npoints;
|
||||
_grid = grid;
|
||||
_directions = directions;
|
||||
_distances = distances;
|
||||
_unified_buffer_size=0;
|
||||
_request_count =0;
|
||||
|
||||
int osites = _grid->oSites();
|
||||
|
||||
for(int i=0;i<npoints;i++){
|
||||
|
||||
int point = i;
|
||||
|
||||
_entries[i].resize( osites);
|
||||
|
||||
int dimension = directions[i];
|
||||
int displacement = distances[i];
|
||||
int shift = displacement;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
_permute_type[point]=_grid->PermuteType(dimension);
|
||||
|
||||
_checkerboard = checkerboard;
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
int sshift[2];
|
||||
|
||||
// Underlying approach. For each local site build
|
||||
// up a table containing the npoint "neighbours" and whether they
|
||||
// live in lattice or a comms buffer.
|
||||
if ( !comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Local(point,dimension,shift,0x3);
|
||||
} else {
|
||||
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
} else { // All permute extract done in comms phase prior to Stencil application
|
||||
// So tables are the same whether comm_dim or splice_dim
|
||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Comms(point,dimension,shift,0x3);
|
||||
// std::cout<<"Comms 0x3"<<std::endl;
|
||||
} else {
|
||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
||||
}
|
||||
}
|
||||
// for(int ss=0;ss<osites;ss++){
|
||||
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
||||
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Local (int point, int dimension,int shiftpm,int cbmask)
|
||||
{
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int gd = _grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
int shift = (shiftpm+fd)%fd;
|
||||
|
||||
// the permute type
|
||||
int permute_dim =_grid->PermuteDim(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * _grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
|
||||
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int wraparound=0;
|
||||
if ( (shiftpm==-1) && (sx>x) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
if ( (shiftpm== 1) && (sx<x) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
|
||||
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void Comms (int point,int dimension,int shiftpm,int cbmask)
|
||||
{
|
||||
GridBase *grid=_grid;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int pd = _grid->_processors[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
// assert(simd_layout==1); // Why?
|
||||
assert(comm_dim==1);
|
||||
int shift = (shiftpm + fd) %fd;
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||
// send to one or more remote nodes.
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
int offnode = (comm_proc!= 0);
|
||||
|
||||
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
|
||||
int wraparound=0;
|
||||
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
if (!offnode) {
|
||||
|
||||
int permute_slice=0;
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
int unified_buffer_offset = _unified_buffer_size;
|
||||
_unified_buffer_size += words;
|
||||
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
||||
_entries[point][lo+o+b]._is_local=1;
|
||||
_entries[point][lo+o+b]._permute=permute;
|
||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
||||
_entries[point][lo+o+b]._is_local=1;
|
||||
_entries[point][lo+o+b]._permute=permute;
|
||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
||||
}
|
||||
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
||||
_entries[point][so+o+b]._is_local=0;
|
||||
_entries[point][so+o+b]._permute=0;
|
||||
_entries[point][so+o+b]._around_the_world=wrap;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
||||
_entries[point][so+o+b]._is_local=0;
|
||||
_entries[point][so+o+b]._permute =0;
|
||||
_entries[point][so+o+b]._around_the_world=wrap;
|
||||
}
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CartesianStencil(GridBase *grid,
|
||||
// int npoints,
|
||||
// int checkerboard,
|
||||
// const std::vector<int> &directions,
|
||||
// const std::vector<int> &distances);
|
||||
|
||||
|
||||
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
|
||||
// Can this be avoided with simpler coding of comms?
|
||||
void Local (int point, int dimension,int shift,int cbmask);
|
||||
void Comms (int point, int dimension,int shift,int cbmask);
|
||||
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
|
||||
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
|
||||
// void Local (int point, int dimension,int shift,int cbmask);
|
||||
// void Comms (int point, int dimension,int shift,int cbmask);
|
||||
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
|
||||
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
|
||||
|
||||
// Could allow a functional munging of the halo to another type during the comms.
|
||||
// this could implement the 16bit/32bit/64bit compression.
|
||||
template<class vobj,class cobj, class compressor> void
|
||||
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
||||
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
||||
{
|
||||
// conformable(source._grid,_grid);
|
||||
assert(source._grid==_grid);
|
||||
halotime-=usecond();
|
||||
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
||||
int u_comm_offset=0;
|
||||
|
||||
@ -124,27 +429,35 @@ namespace Grid {
|
||||
if ( comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
if (splice_dim) {
|
||||
splicetime-=usecond();
|
||||
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
||||
splicetime+=usecond();
|
||||
} else {
|
||||
nosplicetime-=usecond();
|
||||
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
||||
nosplicetime+=usecond();
|
||||
}
|
||||
} else {
|
||||
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
if(splice_dim){
|
||||
splicetime-=usecond();
|
||||
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
|
||||
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
|
||||
splicetime+=usecond();
|
||||
} else {
|
||||
nosplicetime-=usecond();
|
||||
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
|
||||
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
|
||||
nosplicetime+=usecond();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
halotime+=usecond();
|
||||
}
|
||||
|
||||
template<class vobj,class cobj, class compressor>
|
||||
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
||||
int &u_comm_offset,compressor & compress)
|
||||
@ -168,8 +481,7 @@ namespace Grid {
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
|
||||
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
|
||||
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
|
||||
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
||||
@ -186,7 +498,9 @@ namespace Grid {
|
||||
|
||||
int bytes = words * sizeof(cobj);
|
||||
|
||||
gathertime-=usecond();
|
||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
|
||||
gathertime+=usecond();
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
@ -196,32 +510,27 @@ namespace Grid {
|
||||
assert (recv_from_rank != _grid->ThisRank());
|
||||
|
||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||
commtime-=usecond();
|
||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
(void *)&u_comm_buf[u_comm_offset],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
commtime+=usecond();
|
||||
|
||||
for(int i=0;i<words;i++){
|
||||
u_comm_buf[u_comm_offset+i]=recv_buf[i];
|
||||
// std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<" mask 0x"<<cbmask<<std::endl;
|
||||
}
|
||||
u_comm_offset+=words;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class vobj,class cobj, class compressor>
|
||||
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
||||
int &u_comm_offset,compressor &compress)
|
||||
{
|
||||
buftime-=usecond();
|
||||
const int Nsimd = _grid->Nsimd();
|
||||
|
||||
typedef typename cobj::vector_type vector_type;
|
||||
typedef typename cobj::scalar_type scalar_type;
|
||||
typedef typename cobj::scalar_object scalar_object;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@ -244,17 +553,22 @@ namespace Grid {
|
||||
int words = sizeof(cobj)/sizeof(vector_type);
|
||||
|
||||
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
||||
/*
|
||||
* possibly slow to allocate
|
||||
* Doesn't matter in this test, but may want to preallocate in the
|
||||
* dirac operators
|
||||
*/
|
||||
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
std::vector<scalar_object *> pointers(Nsimd); //
|
||||
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
|
||||
// Should grow to max size and then cost very little thereafter
|
||||
send_buf_extract.resize(Nsimd);
|
||||
recv_buf_extract.resize(Nsimd);
|
||||
for(int l=0;l<Nsimd;l++){
|
||||
if( send_buf_extract[l].size() < buffer_size) {
|
||||
send_buf_extract[l].resize(buffer_size);
|
||||
recv_buf_extract[l].resize(buffer_size);
|
||||
}
|
||||
}
|
||||
pointers.resize(Nsimd);
|
||||
rpointers.resize(Nsimd);
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
buftime+=usecond();
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
@ -275,7 +589,9 @@ namespace Grid {
|
||||
}
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
gathermtime-=usecond();
|
||||
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
|
||||
gathermtime+=usecond();
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
@ -302,11 +618,13 @@ namespace Grid {
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
commstime-=usecond();
|
||||
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf_extract[i][0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
commstime+=usecond();
|
||||
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
|
||||
@ -316,11 +634,13 @@ namespace Grid {
|
||||
}
|
||||
|
||||
// Here we don't want to scatter, just place into a buffer.
|
||||
mergetime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int i=0;i<buffer_size;i++){
|
||||
assert(u_comm_offset+i<_unified_buffer_size);
|
||||
// assert(u_comm_offset+i<_unified_buffer_size);
|
||||
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
||||
}
|
||||
|
||||
mergetime+=usecond();
|
||||
u_comm_offset+=buffer_size;
|
||||
}
|
||||
}
|
||||
|
@ -170,7 +170,7 @@ namespace Grid {
|
||||
////////////////////
|
||||
Geometry geom;
|
||||
GridBase * _grid;
|
||||
CartesianStencil Stencil;
|
||||
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
|
||||
|
||||
std::vector<CoarseMatrix> A;
|
||||
|
||||
|
@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
int bo=0;
|
||||
//PARALLEL_NESTED_LOOP21
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
// int bo = n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||
|
||||
if ( cbmask == 0x3 ) {
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int bo = n*rhs._grid->_slice_block[dimension];
|
||||
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int bo=0;
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb &cbmask ) {
|
||||
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
//PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
|
||||
if ( cbmask ==0x3){
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
|
||||
int o=n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
int o=n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb & cbmask ) {
|
||||
cobj temp;
|
||||
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||
extract<cobj>(temp,pointers,offset);
|
||||
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
assert(0); //Fixme think this is buggy
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o=n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
|
||||
if ( ocb & cbmask ) {
|
||||
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||
extract<cobj>(temp,pointers,offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
int bo=0;
|
||||
//PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
// int bo =n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
|
||||
if ( cbmask ==0x3 ) {
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
int bo =n*rhs._grid->_slice_block[dimension];
|
||||
rhs._odata[so+o+b]=buffer[bo+b];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int bo=0;
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
int bo =n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
rhs._odata[so+o+b]=buffer[bo++];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
|
||||
if(cbmask ==0x3 ) {
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
merge(rhs._odata[so+o+b],pointers,offset);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(0); // think this is buggy FIXME
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*rhs._grid->_slice_stride[dimension];
|
||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
merge(rhs._odata[so+o+b],pointers,offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
|
||||
if(cbmask == 0x3 ){
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
|
||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||
if ( ocb&cbmask ) {
|
||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||
}
|
||||
|
||||
}
|
||||
} else {
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
|
||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||
if ( ocb&cbmask ) {
|
||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -26,7 +26,7 @@ namespace Grid {
|
||||
// and Methods:
|
||||
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
|
||||
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
|
||||
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
|
||||
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
|
||||
//
|
||||
@ -101,6 +101,7 @@ namespace Grid {
|
||||
typedef typename Impl::SiteSpinor SiteSpinor; \
|
||||
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
|
||||
typedef typename Impl::Compressor Compressor; \
|
||||
typedef typename Impl::StencilImpl StencilImpl; \
|
||||
typedef typename Impl::ImplParams ImplParams;
|
||||
|
||||
///////
|
||||
@ -112,7 +113,6 @@ namespace Grid {
|
||||
|
||||
typedef ImplGauge<S,Nrepresentation> Gimpl;
|
||||
|
||||
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
||||
@ -128,10 +128,11 @@ namespace Grid {
|
||||
|
||||
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
|
||||
typedef WilsonImplParams ImplParams;
|
||||
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
|
||||
ImplParams Params;
|
||||
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
|
||||
|
||||
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
|
||||
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
|
||||
mult(&phi(),&U(mu),&chi());
|
||||
}
|
||||
|
||||
@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
|
||||
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
|
||||
|
||||
typedef GparityWilsonImplParams ImplParams;
|
||||
ImplParams Params;
|
||||
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
|
||||
|
||||
|
||||
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
|
||||
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
|
||||
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
|
||||
|
||||
typedef SiteHalfSpinor vobj;
|
||||
typedef typename SiteHalfSpinor::scalar_object sobj;
|
||||
|
@ -109,7 +109,7 @@ namespace QCD {
|
||||
///////////////////////////////////
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
|
||||
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
@ -123,7 +123,7 @@ namespace QCD {
|
||||
FermionField Atilde(B._grid);
|
||||
Atilde = A;
|
||||
|
||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
|
||||
st.HaloExchange(B,comm_buf,compressor);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag) {
|
||||
|
||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
Compressor compressor(dag);
|
||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
||||
st.HaloExchange(in,comm_buf,compressor);
|
||||
|
||||
if ( dag == DaggerYes ) {
|
||||
if( HandOptDslash ) {
|
||||
|
@ -73,14 +73,14 @@ namespace Grid {
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Extra methods added by derived
|
||||
///////////////////////////////////////////////////////////////
|
||||
void DerivInternal(CartesianStencil & st,
|
||||
void DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag);
|
||||
|
||||
void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
|
||||
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag) ;
|
||||
|
||||
|
||||
@ -108,9 +108,9 @@ namespace Grid {
|
||||
GridBase * _cbgrid;
|
||||
|
||||
//Defines the stencils for even and odd
|
||||
CartesianStencil Stencil;
|
||||
CartesianStencil StencilEven;
|
||||
CartesianStencil StencilOdd;
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
@ -70,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
||||
|
||||
ImportGauge(_Umu);
|
||||
commtime=0;
|
||||
dslashtime=0;
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
@ -87,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
||||
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
||||
|
||||
Compressor compressor(DaggerNo);
|
||||
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
||||
|
||||
int skip = (disp==1) ? 0 : 1;
|
||||
|
||||
@ -107,7 +109,7 @@ PARALLEL_FOR_LOOP
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
|
||||
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
@ -124,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
|
||||
FermionField Btilde(B._grid);
|
||||
FermionField Atilde(B._grid);
|
||||
|
||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
|
||||
st.HaloExchange(B,comm_buf,compressor);
|
||||
|
||||
Atilde=A;
|
||||
|
||||
@ -196,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
|
||||
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Report(void)
|
||||
{
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
@ -214,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
|
||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
@ -229,13 +252,16 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
|
||||
int cores = GridThread::GetCores();
|
||||
int nwork = U._grid->oSites();
|
||||
|
||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
||||
commtime -=usecond();
|
||||
st.HaloExchange(in,comm_buf,compressor);
|
||||
commtime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
// Not loop ordering and data layout.
|
||||
// Designed to create
|
||||
// - per thread reuse in L1 cache for U
|
||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
||||
dslashtime -=usecond();
|
||||
if ( dag == DaggerYes ) {
|
||||
if( this->HandOptDslash ) {
|
||||
#pragma omp parallel for schedule(static)
|
||||
@ -349,6 +375,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||
|
@ -32,7 +32,8 @@ namespace Grid {
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef WilsonKernels<Impl> Kernels;
|
||||
|
||||
double commtime;
|
||||
double dslashtime;
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
@ -73,14 +74,14 @@ namespace Grid {
|
||||
///////////////////////////////////////////////////////////////
|
||||
// New methods added
|
||||
///////////////////////////////////////////////////////////////
|
||||
void DerivInternal(CartesianStencil & st,
|
||||
void DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag);
|
||||
|
||||
void DhopInternal(CartesianStencil & st,
|
||||
void DhopInternal(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
@ -98,6 +99,7 @@ namespace Grid {
|
||||
// DoubleStore
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
|
||||
void Report(void);
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
///////////////////////////////////////////////////////////////
|
||||
@ -113,9 +115,9 @@ namespace Grid {
|
||||
int Ls;
|
||||
|
||||
//Defines the stencils for even and odd
|
||||
CartesianStencil Stencil;
|
||||
CartesianStencil StencilEven;
|
||||
CartesianStencil StencilOdd;
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
@ -3,7 +3,7 @@ namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
|
||||
{
|
||||
|
@ -17,15 +17,15 @@ namespace Grid {
|
||||
typedef FermionOperator<Impl> Base;
|
||||
|
||||
public:
|
||||
void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in,FermionField &out);
|
||||
|
||||
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
||||
#if defined(AVX512) || defined(IMCI)
|
||||
@ -41,23 +41,23 @@ namespace Grid {
|
||||
#endif
|
||||
#define HANDOPT
|
||||
#ifdef HANDOPT
|
||||
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
#else
|
||||
|
||||
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
}
|
||||
|
||||
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
|
@ -75,7 +75,7 @@ namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
|
||||
{
|
||||
|
@ -282,7 +282,7 @@ namespace QCD {
|
||||
|
||||
#ifdef HANDOPT
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
|
@ -204,7 +204,7 @@ namespace Optimization {
|
||||
#if defined (AVX2)
|
||||
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
|
||||
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
|
||||
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||
#endif
|
||||
}
|
||||
@ -248,8 +248,8 @@ namespace Optimization {
|
||||
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
__m256d a_real = _mm256_moveldup_pd( a ); // Ar Ar
|
||||
__m256d a_imag = _mm256_movehdup_pd( a ); // Ai Ai
|
||||
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
|
||||
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
|
||||
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||
#endif
|
||||
|
@ -1,275 +1,6 @@
|
||||
#include "Grid.h"
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
||||
CartesianStencil::CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances)
|
||||
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
||||
{
|
||||
_npoints = npoints;
|
||||
_grid = grid;
|
||||
_directions = directions;
|
||||
_distances = distances;
|
||||
_unified_buffer_size=0;
|
||||
_request_count =0;
|
||||
|
||||
int osites = _grid->oSites();
|
||||
|
||||
for(int i=0;i<npoints;i++){
|
||||
|
||||
int point = i;
|
||||
|
||||
_entries[i].resize( osites);
|
||||
|
||||
int dimension = directions[i];
|
||||
int displacement = distances[i];
|
||||
int shift = displacement;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
_permute_type[point]=_grid->PermuteType(dimension);
|
||||
|
||||
_checkerboard = checkerboard;
|
||||
|
||||
// the permute type
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||
|
||||
int sshift[2];
|
||||
|
||||
// Underlying approach. For each local site build
|
||||
// up a table containing the npoint "neighbours" and whether they
|
||||
// live in lattice or a comms buffer.
|
||||
if ( !comm_dim ) {
|
||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Local(point,dimension,shift,0x3);
|
||||
} else {
|
||||
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
} else { // All permute extract done in comms phase prior to Stencil application
|
||||
// So tables are the same whether comm_dim or splice_dim
|
||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Comms(point,dimension,shift,0x3);
|
||||
// std::cout<<"Comms 0x3"<<std::endl;
|
||||
} else {
|
||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
||||
}
|
||||
}
|
||||
// for(int ss=0;ss<osites;ss++){
|
||||
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
||||
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CartesianStencil::Local (int point, int dimension,int shiftpm,int cbmask)
|
||||
{
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int gd = _grid->_gdimensions[dimension];
|
||||
|
||||
// Map to always positive shift modulo global full dimension.
|
||||
int shift = (shiftpm+fd)%fd;
|
||||
|
||||
// the permute type
|
||||
int permute_dim =_grid->PermuteDim(dimension);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int o = 0;
|
||||
int bo = x * _grid->_ostride[dimension];
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
|
||||
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int wraparound=0;
|
||||
if ( (shiftpm==-1) && (sx>x) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
if ( (shiftpm== 1) && (sx<x) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
int num = sshift%rd;
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice = 1-wrap;
|
||||
}
|
||||
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianStencil::Comms (int point,int dimension,int shiftpm,int cbmask)
|
||||
{
|
||||
GridBase *grid=_grid;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int pd = _grid->_processors[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
// assert(simd_layout==1); // Why?
|
||||
assert(comm_dim==1);
|
||||
int shift = (shiftpm + fd) %fd;
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||
// send to one or more remote nodes.
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
int offnode = (comm_proc!= 0);
|
||||
|
||||
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
|
||||
int wraparound=0;
|
||||
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
|
||||
wraparound = 1;
|
||||
}
|
||||
if (!offnode) {
|
||||
|
||||
int permute_slice=0;
|
||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
||||
|
||||
int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
int unified_buffer_offset = _unified_buffer_size;
|
||||
_unified_buffer_size += words;
|
||||
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int o = 0; // relative offset to base within plane
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
||||
_entries[point][lo+o+b]._is_local=1;
|
||||
_entries[point][lo+o+b]._permute=permute;
|
||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
||||
_entries[point][lo+o+b]._is_local=1;
|
||||
_entries[point][lo+o+b]._permute=permute;
|
||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
||||
}
|
||||
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
|
||||
{
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
|
||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
// Simple block stride gather of SIMD objects
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
||||
_entries[point][so+o+b]._is_local=0;
|
||||
_entries[point][so+o+b]._permute=0;
|
||||
_entries[point][so+o+b]._around_the_world=wrap;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||
int o = 0; // relative offset to base within plane
|
||||
int bo = 0; // offset in buffer
|
||||
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
||||
_entries[point][so+o+b]._is_local=0;
|
||||
_entries[point][so+o+b]._permute =0;
|
||||
_entries[point][so+o+b]._around_the_world=wrap;
|
||||
}
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
|
||||
template<class vobj> inline
|
||||
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
|
||||
{
|
||||
|
||||
typedef typename vobj::scalar_type scalar_type ;
|
||||
typedef typename vobj::vector_type vector_type ;
|
||||
|
||||
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
const int Nsimd=vobj::vector_type::Nsimd();
|
||||
|
||||
int Nextr=extracted.size();
|
||||
int s = Nsimd/Nextr;
|
||||
scalar_type * vp = (scalar_type *)&vec;
|
||||
|
||||
std::vector<scalar_type *> pointers(Nsimd);
|
||||
for(int i=0;i<Nextr;i++) {
|
||||
pointers[i] =(scalar_type *)& extracted[i][offset];
|
||||
}
|
||||
|
||||
vector_type *vp = (vector_type *)&vec;
|
||||
for(int w=0;w<words;w++){
|
||||
extract<vector_type,scalar_type>(&vp[w],pointers,w);
|
||||
for(int i=0;i<Nextr;i++){
|
||||
scalar_type * pointer = (scalar_type *)& extracted[i][offset];
|
||||
pointer[w] = vp[i*s+w*Nsimd];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
|
||||
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
int Nextr=extracted.size();
|
||||
int s=Nsimd/Nextr;
|
||||
|
||||
std::vector<scalar_type *> pointers(Nextr);
|
||||
for(int i=0;i<Nextr;i++)
|
||||
pointers[i] =(scalar_type *)& extracted[i][offset];
|
||||
|
||||
vector_type *vp = (vector_type *)&vec;
|
||||
scalar_type *pointer;
|
||||
scalar_type *vp = (scalar_type *)&vec;
|
||||
|
||||
for(int w=0;w<words;w++){
|
||||
merge<vector_type,scalar_type>(&vp[w],pointers,w);
|
||||
for(int i=0;i<Nextr;i++){
|
||||
for(int ii=0;ii<s;ii++){
|
||||
pointer=(scalar_type *)&extracted[i][offset];
|
||||
vp[w*Nsimd+i*s+ii] = pointer[w];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
|
||||
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
|
||||
|
||||
|
||||
Test_GaugeAction_SOURCES=Test_GaugeAction.cc
|
||||
|
Loading…
Reference in New Issue
Block a user