mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 11:15:55 +01:00
Merge branch 'master' of github.com:paboyle/Grid
Conflicts: lib/qcd/action/fermion/WilsonFermion5D.cc
This commit is contained in:
commit
899ca41cb8
@ -93,12 +93,13 @@ int main (int argc, char ** argv)
|
|||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
double flops=1344*volume*ncall;
|
double flops=1344*volume*ncall;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
|
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
err = ref-result;
|
err = ref-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
Dw.Report();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
|
|||||||
AC_CHECK_HEADERS(malloc/malloc.h)
|
AC_CHECK_HEADERS(malloc/malloc.h)
|
||||||
AC_CHECK_HEADERS(malloc.h)
|
AC_CHECK_HEADERS(malloc.h)
|
||||||
AC_CHECK_HEADERS(endian.h)
|
AC_CHECK_HEADERS(endian.h)
|
||||||
|
AC_CHECK_HEADERS(execinfo.h)
|
||||||
AC_CHECK_HEADERS(gmp.h)
|
AC_CHECK_HEADERS(gmp.h)
|
||||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
||||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
||||||
|
11
lib/Init.cc
11
lib/Init.cc
@ -16,8 +16,9 @@
|
|||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
|
||||||
#define __X86_64
|
#define __X86_64
|
||||||
#define EXECINFO
|
|
||||||
#ifdef EXECINFO
|
|
||||||
|
#ifdef HAVE_EXECINFO_H
|
||||||
#include <execinfo.h>
|
#include <execinfo.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -233,7 +234,9 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
printf(" mem address %llx\n",(unsigned long long)si->si_addr);
|
printf(" mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
printf(" code %d\n",si->si_code);
|
printf(" code %d\n",si->si_code);
|
||||||
|
|
||||||
#ifdef __X86_64
|
// Linux/Posix
|
||||||
|
#ifdef __linux__
|
||||||
|
// And x86 64bit
|
||||||
ucontext_t * uc= (ucontext_t *)ptr;
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
printf(" instruction %llx\n",(unsigned long long)sc->rip);
|
printf(" instruction %llx\n",(unsigned long long)sc->rip);
|
||||||
@ -259,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
REG(r14);
|
REG(r14);
|
||||||
REG(r15);
|
REG(r15);
|
||||||
#endif
|
#endif
|
||||||
#ifdef EXECINFO
|
#ifdef HAVE_EXECINFO_H
|
||||||
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
|
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
|
||||||
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
|
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
|
||||||
for (int i = 0; i < symbols; i++){
|
for (int i = 0; i < symbols; i++){
|
||||||
|
392
lib/Stencil.h
392
lib/Stencil.h
@ -48,10 +48,14 @@ namespace Grid {
|
|||||||
int _around_the_world;
|
int _around_the_world;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class vobj,class cobj, class compressor>
|
||||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef uint32_t StencilInteger;
|
typedef uint32_t StencilInteger;
|
||||||
|
typedef typename cobj::vector_type vector_type;
|
||||||
|
typedef typename cobj::scalar_type scalar_type;
|
||||||
|
typedef typename cobj::scalar_object scalar_object;
|
||||||
|
|
||||||
int _checkerboard;
|
int _checkerboard;
|
||||||
int _npoints; // Move to template param?
|
int _npoints; // Move to template param?
|
||||||
@ -66,33 +70,334 @@ namespace Grid {
|
|||||||
// npoints x Osites() of these
|
// npoints x Osites() of these
|
||||||
std::vector<std::vector<StencilEntry> > _entries;
|
std::vector<std::vector<StencilEntry> > _entries;
|
||||||
|
|
||||||
|
// Comms buffers
|
||||||
|
std::vector<std::vector<scalar_object> > send_buf_extract;
|
||||||
|
std::vector<std::vector<scalar_object> > recv_buf_extract;
|
||||||
|
std::vector<scalar_object *> pointers;
|
||||||
|
std::vector<scalar_object *> rpointers;
|
||||||
|
Vector<cobj> send_buf;
|
||||||
|
|
||||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
||||||
|
|
||||||
int _unified_buffer_size;
|
int _unified_buffer_size;
|
||||||
int _request_count;
|
int _request_count;
|
||||||
|
|
||||||
|
double buftime;
|
||||||
|
double gathertime;
|
||||||
|
double commtime;
|
||||||
|
double commstime;
|
||||||
|
double halotime;
|
||||||
|
double scattertime;
|
||||||
|
double mergetime;
|
||||||
|
double gathermtime;
|
||||||
|
double splicetime;
|
||||||
|
double nosplicetime;
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
|
||||||
int npoints,
|
|
||||||
int checkerboard,
|
|
||||||
const std::vector<int> &directions,
|
CartesianStencil(GridBase *grid,
|
||||||
const std::vector<int> &distances);
|
int npoints,
|
||||||
|
int checkerboard,
|
||||||
|
const std::vector<int> &directions,
|
||||||
|
const std::vector<int> &distances)
|
||||||
|
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
||||||
|
{
|
||||||
|
gathertime=0;
|
||||||
|
commtime=0;
|
||||||
|
commstime=0;
|
||||||
|
halotime=0;
|
||||||
|
scattertime=0;
|
||||||
|
mergetime=0;
|
||||||
|
gathermtime=0;
|
||||||
|
buftime=0;
|
||||||
|
splicetime=0;
|
||||||
|
nosplicetime=0;
|
||||||
|
|
||||||
|
_npoints = npoints;
|
||||||
|
_grid = grid;
|
||||||
|
_directions = directions;
|
||||||
|
_distances = distances;
|
||||||
|
_unified_buffer_size=0;
|
||||||
|
_request_count =0;
|
||||||
|
|
||||||
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
|
for(int i=0;i<npoints;i++){
|
||||||
|
|
||||||
|
int point = i;
|
||||||
|
|
||||||
|
_entries[i].resize( osites);
|
||||||
|
|
||||||
|
int dimension = directions[i];
|
||||||
|
int displacement = distances[i];
|
||||||
|
int shift = displacement;
|
||||||
|
|
||||||
|
int fd = _grid->_fdimensions[dimension];
|
||||||
|
int rd = _grid->_rdimensions[dimension];
|
||||||
|
_permute_type[point]=_grid->PermuteType(dimension);
|
||||||
|
|
||||||
|
_checkerboard = checkerboard;
|
||||||
|
|
||||||
|
// the permute type
|
||||||
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
|
|
||||||
|
int sshift[2];
|
||||||
|
|
||||||
|
// Underlying approach. For each local site build
|
||||||
|
// up a table containing the npoint "neighbours" and whether they
|
||||||
|
// live in lattice or a comms buffer.
|
||||||
|
if ( !comm_dim ) {
|
||||||
|
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||||
|
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||||
|
|
||||||
|
if ( sshift[0] == sshift[1] ) {
|
||||||
|
Local(point,dimension,shift,0x3);
|
||||||
|
} else {
|
||||||
|
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
|
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
|
}
|
||||||
|
} else { // All permute extract done in comms phase prior to Stencil application
|
||||||
|
// So tables are the same whether comm_dim or splice_dim
|
||||||
|
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||||
|
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||||
|
if ( sshift[0] == sshift[1] ) {
|
||||||
|
Comms(point,dimension,shift,0x3);
|
||||||
|
// std::cout<<"Comms 0x3"<<std::endl;
|
||||||
|
} else {
|
||||||
|
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
|
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
|
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// for(int ss=0;ss<osites;ss++){
|
||||||
|
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
||||||
|
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void Local (int point, int dimension,int shiftpm,int cbmask)
|
||||||
|
{
|
||||||
|
int fd = _grid->_fdimensions[dimension];
|
||||||
|
int rd = _grid->_rdimensions[dimension];
|
||||||
|
int ld = _grid->_ldimensions[dimension];
|
||||||
|
int gd = _grid->_gdimensions[dimension];
|
||||||
|
|
||||||
|
// Map to always positive shift modulo global full dimension.
|
||||||
|
int shift = (shiftpm+fd)%fd;
|
||||||
|
|
||||||
|
// the permute type
|
||||||
|
int permute_dim =_grid->PermuteDim(dimension);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int o = 0;
|
||||||
|
int bo = x * _grid->_ostride[dimension];
|
||||||
|
|
||||||
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
|
||||||
|
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
int wraparound=0;
|
||||||
|
if ( (shiftpm==-1) && (sx>x) ) {
|
||||||
|
wraparound = 1;
|
||||||
|
}
|
||||||
|
if ( (shiftpm== 1) && (sx<x) ) {
|
||||||
|
wraparound = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
if(permute_dim){
|
||||||
|
int wrap = sshift/rd;
|
||||||
|
int num = sshift%rd;
|
||||||
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
|
else permute_slice = 1-wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Comms (int point,int dimension,int shiftpm,int cbmask)
|
||||||
|
{
|
||||||
|
GridBase *grid=_grid;
|
||||||
|
|
||||||
|
int fd = _grid->_fdimensions[dimension];
|
||||||
|
int ld = _grid->_ldimensions[dimension];
|
||||||
|
int rd = _grid->_rdimensions[dimension];
|
||||||
|
int pd = _grid->_processors[dimension];
|
||||||
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
|
// assert(simd_layout==1); // Why?
|
||||||
|
assert(comm_dim==1);
|
||||||
|
int shift = (shiftpm + fd) %fd;
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
|
||||||
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||||
|
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||||
|
// send to one or more remote nodes.
|
||||||
|
|
||||||
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||||
|
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
int offnode = (comm_proc!= 0);
|
||||||
|
|
||||||
|
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
|
||||||
|
int wraparound=0;
|
||||||
|
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
||||||
|
wraparound = 1;
|
||||||
|
}
|
||||||
|
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
|
||||||
|
wraparound = 1;
|
||||||
|
}
|
||||||
|
if (!offnode) {
|
||||||
|
|
||||||
|
int permute_slice=0;
|
||||||
|
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int words = buffer_size;
|
||||||
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
|
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
||||||
|
|
||||||
|
int rank = grid->_processor;
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
|
int unified_buffer_offset = _unified_buffer_size;
|
||||||
|
_unified_buffer_size += words;
|
||||||
|
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||||
|
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
|
||||||
|
{
|
||||||
|
int rd = _grid->_rdimensions[dimension];
|
||||||
|
|
||||||
|
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||||
|
|
||||||
|
int o = 0; // relative offset to base within plane
|
||||||
|
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||||
|
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
|
||||||
|
|
||||||
|
// Simple block stride gather of SIMD objects
|
||||||
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
|
_entries[point][lo+o+b]._offset =ro+o+b;
|
||||||
|
_entries[point][lo+o+b]._is_local=1;
|
||||||
|
_entries[point][lo+o+b]._permute=permute;
|
||||||
|
_entries[point][lo+o+b]._around_the_world=wrap;
|
||||||
|
}
|
||||||
|
o +=_grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||||
|
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||||
|
int o = 0; // relative offset to base within plane
|
||||||
|
|
||||||
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
|
if ( ocb&cbmask ) {
|
||||||
|
_entries[point][lo+o+b]._offset =ro+o+b;
|
||||||
|
_entries[point][lo+o+b]._is_local=1;
|
||||||
|
_entries[point][lo+o+b]._permute=permute;
|
||||||
|
_entries[point][lo+o+b]._around_the_world=wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
o +=_grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
||||||
|
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
|
||||||
|
{
|
||||||
|
int rd = _grid->_rdimensions[dimension];
|
||||||
|
|
||||||
|
if ( !_grid->CheckerBoarded(dimension) ) {
|
||||||
|
|
||||||
|
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||||
|
int o = 0; // relative offset to base within plane
|
||||||
|
int bo = 0; // offset in buffer
|
||||||
|
|
||||||
|
// Simple block stride gather of SIMD objects
|
||||||
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
|
_entries[point][so+o+b]._offset =offset+(bo++);
|
||||||
|
_entries[point][so+o+b]._is_local=0;
|
||||||
|
_entries[point][so+o+b]._permute=0;
|
||||||
|
_entries[point][so+o+b]._around_the_world=wrap;
|
||||||
|
}
|
||||||
|
o +=_grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
||||||
|
int o = 0; // relative offset to base within plane
|
||||||
|
int bo = 0; // offset in buffer
|
||||||
|
|
||||||
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
|
|
||||||
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
_entries[point][so+o+b]._offset =offset+(bo++);
|
||||||
|
_entries[point][so+o+b]._is_local=0;
|
||||||
|
_entries[point][so+o+b]._permute =0;
|
||||||
|
_entries[point][so+o+b]._around_the_world=wrap;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
o +=_grid->_slice_stride[dimension];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CartesianStencil(GridBase *grid,
|
||||||
|
// int npoints,
|
||||||
|
// int checkerboard,
|
||||||
|
// const std::vector<int> &directions,
|
||||||
|
// const std::vector<int> &distances);
|
||||||
|
|
||||||
|
|
||||||
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
|
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
|
||||||
// Can this be avoided with simpler coding of comms?
|
// Can this be avoided with simpler coding of comms?
|
||||||
void Local (int point, int dimension,int shift,int cbmask);
|
// void Local (int point, int dimension,int shift,int cbmask);
|
||||||
void Comms (int point, int dimension,int shift,int cbmask);
|
// void Comms (int point, int dimension,int shift,int cbmask);
|
||||||
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
|
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
|
||||||
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
|
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
|
||||||
|
|
||||||
// Could allow a functional munging of the halo to another type during the comms.
|
// Could allow a functional munging of the halo to another type during the comms.
|
||||||
// this could implement the 16bit/32bit/64bit compression.
|
// this could implement the 16bit/32bit/64bit compression.
|
||||||
template<class vobj,class cobj, class compressor> void
|
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
||||||
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
|
||||||
{
|
{
|
||||||
// conformable(source._grid,_grid);
|
// conformable(source._grid,_grid);
|
||||||
assert(source._grid==_grid);
|
assert(source._grid==_grid);
|
||||||
|
halotime-=usecond();
|
||||||
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
||||||
int u_comm_offset=0;
|
int u_comm_offset=0;
|
||||||
|
|
||||||
@ -124,27 +429,35 @@ namespace Grid {
|
|||||||
if ( comm_dim ) {
|
if ( comm_dim ) {
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||||
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
if (splice_dim) {
|
if (splice_dim) {
|
||||||
|
splicetime-=usecond();
|
||||||
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
||||||
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
|
nosplicetime-=usecond();
|
||||||
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
||||||
|
nosplicetime+=usecond();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||||
if(splice_dim){
|
if(splice_dim){
|
||||||
|
splicetime-=usecond();
|
||||||
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
|
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
|
||||||
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
|
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
|
||||||
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
|
nosplicetime-=usecond();
|
||||||
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
|
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
|
||||||
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
|
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
|
||||||
|
nosplicetime+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
halotime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class cobj, class compressor>
|
|
||||||
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||||
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
||||||
int &u_comm_offset,compressor & compress)
|
int &u_comm_offset,compressor & compress)
|
||||||
@ -168,8 +481,7 @@ namespace Grid {
|
|||||||
|
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||||
|
|
||||||
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
|
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
|
||||||
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
||||||
@ -186,7 +498,9 @@ namespace Grid {
|
|||||||
|
|
||||||
int bytes = words * sizeof(cobj);
|
int bytes = words * sizeof(cobj);
|
||||||
|
|
||||||
|
gathertime-=usecond();
|
||||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
|
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
|
||||||
|
gathertime+=usecond();
|
||||||
|
|
||||||
int rank = _grid->_processor;
|
int rank = _grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
@ -196,32 +510,27 @@ namespace Grid {
|
|||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||||
|
commtime-=usecond();
|
||||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&u_comm_buf[u_comm_offset],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
commtime+=usecond();
|
||||||
|
|
||||||
for(int i=0;i<words;i++){
|
|
||||||
u_comm_buf[u_comm_offset+i]=recv_buf[i];
|
|
||||||
// std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<" mask 0x"<<cbmask<<std::endl;
|
|
||||||
}
|
|
||||||
u_comm_offset+=words;
|
u_comm_offset+=words;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj,class cobj, class compressor>
|
|
||||||
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
||||||
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
||||||
int &u_comm_offset,compressor &compress)
|
int &u_comm_offset,compressor &compress)
|
||||||
{
|
{
|
||||||
|
buftime-=usecond();
|
||||||
const int Nsimd = _grid->Nsimd();
|
const int Nsimd = _grid->Nsimd();
|
||||||
|
|
||||||
typedef typename cobj::vector_type vector_type;
|
|
||||||
typedef typename cobj::scalar_type scalar_type;
|
|
||||||
typedef typename cobj::scalar_object scalar_object;
|
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -244,17 +553,22 @@ namespace Grid {
|
|||||||
int words = sizeof(cobj)/sizeof(vector_type);
|
int words = sizeof(cobj)/sizeof(vector_type);
|
||||||
|
|
||||||
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
||||||
/*
|
|
||||||
* possibly slow to allocate
|
|
||||||
* Doesn't matter in this test, but may want to preallocate in the
|
|
||||||
* dirac operators
|
|
||||||
*/
|
|
||||||
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
|
||||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
|
||||||
|
|
||||||
std::vector<scalar_object *> pointers(Nsimd); //
|
// Should grow to max size and then cost very little thereafter
|
||||||
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
|
send_buf_extract.resize(Nsimd);
|
||||||
|
recv_buf_extract.resize(Nsimd);
|
||||||
|
for(int l=0;l<Nsimd;l++){
|
||||||
|
if( send_buf_extract[l].size() < buffer_size) {
|
||||||
|
send_buf_extract[l].resize(buffer_size);
|
||||||
|
recv_buf_extract[l].resize(buffer_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pointers.resize(Nsimd);
|
||||||
|
rpointers.resize(Nsimd);
|
||||||
|
|
||||||
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
|
buftime+=usecond();
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Work out what to send where
|
// Work out what to send where
|
||||||
@ -275,7 +589,9 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
gathermtime-=usecond();
|
||||||
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
|
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
|
||||||
|
gathermtime+=usecond();
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
@ -302,11 +618,13 @@ namespace Grid {
|
|||||||
|
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
commstime-=usecond();
|
||||||
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf_extract[i][0],
|
(void *)&recv_buf_extract[i][0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
commstime+=usecond();
|
||||||
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
|
|
||||||
@ -316,11 +634,13 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Here we don't want to scatter, just place into a buffer.
|
// Here we don't want to scatter, just place into a buffer.
|
||||||
|
mergetime-=usecond();
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
for(int i=0;i<buffer_size;i++){
|
for(int i=0;i<buffer_size;i++){
|
||||||
assert(u_comm_offset+i<_unified_buffer_size);
|
// assert(u_comm_offset+i<_unified_buffer_size);
|
||||||
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
||||||
}
|
}
|
||||||
|
mergetime+=usecond();
|
||||||
u_comm_offset+=buffer_size;
|
u_comm_offset+=buffer_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -170,7 +170,7 @@ namespace Grid {
|
|||||||
////////////////////
|
////////////////////
|
||||||
Geometry geom;
|
Geometry geom;
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
CartesianStencil Stencil;
|
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
|
@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
int bo=0;
|
|
||||||
//PARALLEL_NESTED_LOOP21
|
if ( cbmask == 0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int b=0;b<e2;b++){
|
for(int n=0;n<e1;n++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
for(int b=0;b<e2;b++){
|
||||||
// int bo = n*rhs._grid->_slice_block[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int bo = n*rhs._grid->_slice_block[dimension];
|
||||||
if ( ocb &cbmask ) {
|
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
|
if ( ocb &cbmask ) {
|
||||||
|
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
//PARALLEL_NESTED_LOOP2
|
|
||||||
for(int n=0;n<e1;n++){
|
if ( cbmask ==0x3){
|
||||||
for(int b=0;b<e2;b++){
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o=n*rhs._grid->_slice_stride[dimension];
|
int o=n*rhs._grid->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
if ( ocb & cbmask ) {
|
|
||||||
cobj temp;
|
|
||||||
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
|
||||||
|
assert(0); //Fixme think this is buggy
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o=n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
int bo=0;
|
|
||||||
//PARALLEL_NESTED_LOOP2
|
if ( cbmask ==0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int b=0;b<e2;b++){
|
for(int n=0;n<e1;n++){
|
||||||
int o =n*rhs._grid->_slice_stride[dimension];
|
for(int b=0;b<e2;b++){
|
||||||
// int bo =n*rhs._grid->_slice_block[dimension];
|
int o =n*rhs._grid->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int bo =n*rhs._grid->_slice_block[dimension];
|
||||||
if ( ocb & cbmask ) {
|
rhs._odata[so+o+b]=buffer[bo+b];
|
||||||
rhs._odata[so+o+b]=buffer[bo++];
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int bo=0;
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int bo =n*rhs._grid->_slice_block[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
rhs._odata[so+o+b]=buffer[bo++];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if(cbmask ==0x3 ) {
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
|
||||||
if ( ocb&cbmask ) {
|
|
||||||
merge(rhs._odata[so+o+b],pointers,offset);
|
merge(rhs._odata[so+o+b],pointers,offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
assert(0); // think this is buggy FIXME
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
|
if ( ocb&cbmask ) {
|
||||||
|
merge(rhs._odata[so+o+b],pointers,offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
|
if(cbmask == 0x3 ){
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
if ( ocb&cbmask ) {
|
|
||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
|
||||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PARALLEL_NESTED_LOOP2
|
||||||
|
for(int n=0;n<e1;n++){
|
||||||
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
|
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||||
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||||
|
if ( ocb&cbmask ) {
|
||||||
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ namespace Grid {
|
|||||||
// and Methods:
|
// and Methods:
|
||||||
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||||
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||||
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
|
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
|
||||||
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
|
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
|
||||||
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
|
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
|
||||||
//
|
//
|
||||||
@ -101,6 +101,7 @@ namespace Grid {
|
|||||||
typedef typename Impl::SiteSpinor SiteSpinor; \
|
typedef typename Impl::SiteSpinor SiteSpinor; \
|
||||||
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
|
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
|
||||||
typedef typename Impl::Compressor Compressor; \
|
typedef typename Impl::Compressor Compressor; \
|
||||||
|
typedef typename Impl::StencilImpl StencilImpl; \
|
||||||
typedef typename Impl::ImplParams ImplParams;
|
typedef typename Impl::ImplParams ImplParams;
|
||||||
|
|
||||||
///////
|
///////
|
||||||
@ -112,7 +113,6 @@ namespace Grid {
|
|||||||
|
|
||||||
typedef ImplGauge<S,Nrepresentation> Gimpl;
|
typedef ImplGauge<S,Nrepresentation> Gimpl;
|
||||||
|
|
||||||
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
|
|
||||||
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
||||||
@ -128,10 +128,11 @@ namespace Grid {
|
|||||||
|
|
||||||
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
|
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
|
||||||
typedef WilsonImplParams ImplParams;
|
typedef WilsonImplParams ImplParams;
|
||||||
|
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
|
||||||
ImplParams Params;
|
ImplParams Params;
|
||||||
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
|
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
|
||||||
|
|
||||||
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
|
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
|
||||||
mult(&phi(),&U(mu),&chi());
|
mult(&phi(),&U(mu),&chi());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
|
|||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||||
|
|
||||||
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
|
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
|
||||||
|
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
|
||||||
|
|
||||||
typedef GparityWilsonImplParams ImplParams;
|
typedef GparityWilsonImplParams ImplParams;
|
||||||
ImplParams Params;
|
ImplParams Params;
|
||||||
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
|
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
|
||||||
|
|
||||||
|
|
||||||
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
|
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
|
||||||
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
|
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
|
||||||
|
|
||||||
typedef SiteHalfSpinor vobj;
|
typedef SiteHalfSpinor vobj;
|
||||||
typedef typename SiteHalfSpinor::scalar_object sobj;
|
typedef typename SiteHalfSpinor::scalar_object sobj;
|
||||||
|
@ -109,7 +109,7 @@ namespace QCD {
|
|||||||
///////////////////////////////////
|
///////////////////////////////////
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
|
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A,
|
const FermionField &A,
|
||||||
@ -123,7 +123,7 @@ namespace QCD {
|
|||||||
FermionField Atilde(B._grid);
|
FermionField Atilde(B._grid);
|
||||||
Atilde = A;
|
Atilde = A;
|
||||||
|
|
||||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
|
st.HaloExchange(B,comm_buf,compressor);
|
||||||
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
Stencil.HaloExchange(in,comm_buf,compressor);
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag) {
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
|
||||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
st.HaloExchange(in,comm_buf,compressor);
|
||||||
|
|
||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
|
@ -73,14 +73,14 @@ namespace Grid {
|
|||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Extra methods added by derived
|
// Extra methods added by derived
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void DerivInternal(CartesianStencil & st,
|
void DerivInternal(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A,
|
const FermionField &A,
|
||||||
const FermionField &B,
|
const FermionField &B,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
|
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag) ;
|
const FermionField &in, FermionField &out,int dag) ;
|
||||||
|
|
||||||
|
|
||||||
@ -108,9 +108,9 @@ namespace Grid {
|
|||||||
GridBase * _cbgrid;
|
GridBase * _cbgrid;
|
||||||
|
|
||||||
//Defines the stencils for even and odd
|
//Defines the stencils for even and odd
|
||||||
CartesianStencil Stencil;
|
StencilImpl Stencil;
|
||||||
CartesianStencil StencilEven;
|
StencilImpl StencilEven;
|
||||||
CartesianStencil StencilOdd;
|
StencilImpl StencilOdd;
|
||||||
|
|
||||||
// Copy of the gauge field , with even and odd subsets
|
// Copy of the gauge field , with even and odd subsets
|
||||||
DoubledGaugeField Umu;
|
DoubledGaugeField Umu;
|
||||||
|
@ -70,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
||||||
|
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
|
commtime=0;
|
||||||
|
dslashtime=0;
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||||
@ -87,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
|||||||
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
||||||
|
|
||||||
Compressor compressor(DaggerNo);
|
Compressor compressor(DaggerNo);
|
||||||
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
Stencil.HaloExchange(in,comm_buf,compressor);
|
||||||
|
|
||||||
int skip = (disp==1) ? 0 : 1;
|
int skip = (disp==1) ? 0 : 1;
|
||||||
|
|
||||||
@ -107,7 +109,7 @@ PARALLEL_FOR_LOOP
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
|
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A,
|
const FermionField &A,
|
||||||
@ -124,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
|
|||||||
FermionField Btilde(B._grid);
|
FermionField Btilde(B._grid);
|
||||||
FermionField Atilde(B._grid);
|
FermionField Atilde(B._grid);
|
||||||
|
|
||||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
|
st.HaloExchange(B,comm_buf,compressor);
|
||||||
|
|
||||||
Atilde=A;
|
Atilde=A;
|
||||||
|
|
||||||
@ -196,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
|
|||||||
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
|
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonFermion5D<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||||
const FermionField &A,
|
const FermionField &A,
|
||||||
@ -214,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
|
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@ -229,13 +252,16 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
|
|||||||
int cores = GridThread::GetCores();
|
int cores = GridThread::GetCores();
|
||||||
int nwork = U._grid->oSites();
|
int nwork = U._grid->oSites();
|
||||||
|
|
||||||
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
|
commtime -=usecond();
|
||||||
|
st.HaloExchange(in,comm_buf,compressor);
|
||||||
|
commtime +=usecond();
|
||||||
|
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
// Not loop ordering and data layout.
|
// Not loop ordering and data layout.
|
||||||
// Designed to create
|
// Designed to create
|
||||||
// - per thread reuse in L1 cache for U
|
// - per thread reuse in L1 cache for U
|
||||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
||||||
|
dslashtime -=usecond();
|
||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
if( this->HandOptDslash ) {
|
if( this->HandOptDslash ) {
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
@ -349,6 +375,7 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
dslashtime +=usecond();
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||||
|
@ -32,7 +32,8 @@ namespace Grid {
|
|||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
|
double commtime;
|
||||||
|
double dslashtime;
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -73,14 +74,14 @@ namespace Grid {
|
|||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// New methods added
|
// New methods added
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void DerivInternal(CartesianStencil & st,
|
void DerivInternal(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A,
|
const FermionField &A,
|
||||||
const FermionField &B,
|
const FermionField &B,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(CartesianStencil & st,
|
void DhopInternal(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@ -98,6 +99,7 @@ namespace Grid {
|
|||||||
// DoubleStore
|
// DoubleStore
|
||||||
void ImportGauge(const GaugeField &_Umu);
|
void ImportGauge(const GaugeField &_Umu);
|
||||||
|
|
||||||
|
void Report(void);
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Data members require to support the functionality
|
// Data members require to support the functionality
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -113,9 +115,9 @@ namespace Grid {
|
|||||||
int Ls;
|
int Ls;
|
||||||
|
|
||||||
//Defines the stencils for even and odd
|
//Defines the stencils for even and odd
|
||||||
CartesianStencil Stencil;
|
StencilImpl Stencil;
|
||||||
CartesianStencil StencilEven;
|
StencilImpl StencilEven;
|
||||||
CartesianStencil StencilOdd;
|
StencilImpl StencilOdd;
|
||||||
|
|
||||||
// Copy of the gauge field , with even and odd subsets
|
// Copy of the gauge field , with even and odd subsets
|
||||||
DoubledGaugeField Umu;
|
DoubledGaugeField Umu;
|
||||||
|
@ -3,7 +3,7 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
|
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
|
||||||
{
|
{
|
||||||
|
@ -17,15 +17,15 @@ namespace Grid {
|
|||||||
typedef FermionOperator<Impl> Base;
|
typedef FermionOperator<Impl> Base;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in,FermionField &out);
|
int sF,int sU,const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
||||||
#if defined(AVX512) || defined(IMCI)
|
#if defined(AVX512) || defined(IMCI)
|
||||||
@ -41,23 +41,23 @@ namespace Grid {
|
|||||||
#endif
|
#endif
|
||||||
#define HANDOPT
|
#define HANDOPT
|
||||||
#ifdef HANDOPT
|
#ifdef HANDOPT
|
||||||
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||||
}
|
}
|
||||||
|
|
||||||
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
@ -75,7 +75,7 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
|
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
|
||||||
{
|
{
|
||||||
|
@ -282,7 +282,7 @@ namespace QCD {
|
|||||||
|
|
||||||
#ifdef HANDOPT
|
#ifdef HANDOPT
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
@ -204,7 +204,7 @@ namespace Optimization {
|
|||||||
#if defined (AVX2)
|
#if defined (AVX2)
|
||||||
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
|
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
|
||||||
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
|
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
|
||||||
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||||
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -248,8 +248,8 @@ namespace Optimization {
|
|||||||
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX2)
|
#if defined (AVX2)
|
||||||
__m256d a_real = _mm256_moveldup_pd( a ); // Ar Ar
|
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
|
||||||
__m256d a_imag = _mm256_movehdup_pd( a ); // Ai Ai
|
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
|
||||||
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||||
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,275 +1,6 @@
|
|||||||
#include "Grid.h"
|
#include "Grid.h"
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
|
||||||
CartesianStencil::CartesianStencil(GridBase *grid,
|
|
||||||
int npoints,
|
|
||||||
int checkerboard,
|
|
||||||
const std::vector<int> &directions,
|
|
||||||
const std::vector<int> &distances)
|
|
||||||
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
|
||||||
{
|
|
||||||
_npoints = npoints;
|
|
||||||
_grid = grid;
|
|
||||||
_directions = directions;
|
|
||||||
_distances = distances;
|
|
||||||
_unified_buffer_size=0;
|
|
||||||
_request_count =0;
|
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
|
||||||
|
|
||||||
for(int i=0;i<npoints;i++){
|
|
||||||
|
|
||||||
int point = i;
|
|
||||||
|
|
||||||
_entries[i].resize( osites);
|
|
||||||
|
|
||||||
int dimension = directions[i];
|
|
||||||
int displacement = distances[i];
|
|
||||||
int shift = displacement;
|
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
|
||||||
int rd = _grid->_rdimensions[dimension];
|
|
||||||
_permute_type[point]=_grid->PermuteType(dimension);
|
|
||||||
|
|
||||||
_checkerboard = checkerboard;
|
|
||||||
|
|
||||||
// the permute type
|
|
||||||
int simd_layout = _grid->_simd_layout[dimension];
|
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
|
||||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
|
||||||
|
|
||||||
int sshift[2];
|
|
||||||
|
|
||||||
// Underlying approach. For each local site build
|
|
||||||
// up a table containing the npoint "neighbours" and whether they
|
|
||||||
// live in lattice or a comms buffer.
|
|
||||||
if ( !comm_dim ) {
|
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
|
||||||
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
|
||||||
Local(point,dimension,shift,0x3);
|
|
||||||
} else {
|
|
||||||
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
|
||||||
Local(point,dimension,shift,0x2);// both with block stride loop iteration
|
|
||||||
}
|
|
||||||
} else { // All permute extract done in comms phase prior to Stencil application
|
|
||||||
// So tables are the same whether comm_dim or splice_dim
|
|
||||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
|
||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
|
||||||
Comms(point,dimension,shift,0x3);
|
|
||||||
// std::cout<<"Comms 0x3"<<std::endl;
|
|
||||||
} else {
|
|
||||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
|
||||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
|
||||||
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// for(int ss=0;ss<osites;ss++){
|
|
||||||
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
|
||||||
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CartesianStencil::Local (int point, int dimension,int shiftpm,int cbmask)
|
|
||||||
{
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
|
||||||
int rd = _grid->_rdimensions[dimension];
|
|
||||||
int ld = _grid->_ldimensions[dimension];
|
|
||||||
int gd = _grid->_gdimensions[dimension];
|
|
||||||
|
|
||||||
// Map to always positive shift modulo global full dimension.
|
|
||||||
int shift = (shiftpm+fd)%fd;
|
|
||||||
|
|
||||||
// the permute type
|
|
||||||
int permute_dim =_grid->PermuteDim(dimension);
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
int o = 0;
|
|
||||||
int bo = x * _grid->_ostride[dimension];
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
|
||||||
|
|
||||||
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
|
|
||||||
int wraparound=0;
|
|
||||||
if ( (shiftpm==-1) && (sx>x) ) {
|
|
||||||
wraparound = 1;
|
|
||||||
}
|
|
||||||
if ( (shiftpm== 1) && (sx<x) ) {
|
|
||||||
wraparound = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int permute_slice=0;
|
|
||||||
if(permute_dim){
|
|
||||||
int wrap = sshift/rd;
|
|
||||||
int num = sshift%rd;
|
|
||||||
if ( x< rd-num ) permute_slice=wrap;
|
|
||||||
else permute_slice = 1-wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void CartesianStencil::Comms (int point,int dimension,int shiftpm,int cbmask)
|
|
||||||
{
|
|
||||||
GridBase *grid=_grid;
|
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
|
||||||
int ld = _grid->_ldimensions[dimension];
|
|
||||||
int rd = _grid->_rdimensions[dimension];
|
|
||||||
int pd = _grid->_processors[dimension];
|
|
||||||
int simd_layout = _grid->_simd_layout[dimension];
|
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
|
||||||
|
|
||||||
// assert(simd_layout==1); // Why?
|
|
||||||
assert(comm_dim==1);
|
|
||||||
int shift = (shiftpm + fd) %fd;
|
|
||||||
assert(shift>=0);
|
|
||||||
assert(shift<fd);
|
|
||||||
|
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
|
||||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
|
||||||
// send to one or more remote nodes.
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
|
||||||
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
|
||||||
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
|
||||||
int offnode = (comm_proc!= 0);
|
|
||||||
|
|
||||||
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
|
|
||||||
int wraparound=0;
|
|
||||||
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
|
||||||
wraparound = 1;
|
|
||||||
}
|
|
||||||
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
|
|
||||||
wraparound = 1;
|
|
||||||
}
|
|
||||||
if (!offnode) {
|
|
||||||
|
|
||||||
int permute_slice=0;
|
|
||||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
int words = buffer_size;
|
|
||||||
if (cbmask != 0x3) words=words>>1;
|
|
||||||
|
|
||||||
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
|
||||||
|
|
||||||
int rank = grid->_processor;
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
|
|
||||||
int unified_buffer_offset = _unified_buffer_size;
|
|
||||||
_unified_buffer_size += words;
|
|
||||||
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
|
||||||
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
|
|
||||||
{
|
|
||||||
int rd = _grid->_rdimensions[dimension];
|
|
||||||
|
|
||||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
|
||||||
|
|
||||||
int o = 0; // relative offset to base within plane
|
|
||||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
|
||||||
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
|
|
||||||
|
|
||||||
// Simple block stride gather of SIMD objects
|
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
|
||||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
|
||||||
_entries[point][lo+o+b]._is_local=1;
|
|
||||||
_entries[point][lo+o+b]._permute=permute;
|
|
||||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
|
||||||
}
|
|
||||||
o +=_grid->_slice_stride[dimension];
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
|
|
||||||
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
|
|
||||||
int o = 0; // relative offset to base within plane
|
|
||||||
|
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
|
||||||
|
|
||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
|
||||||
|
|
||||||
if ( ocb&cbmask ) {
|
|
||||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
|
||||||
_entries[point][lo+o+b]._is_local=1;
|
|
||||||
_entries[point][lo+o+b]._permute=permute;
|
|
||||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
o +=_grid->_slice_stride[dimension];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Routine builds up integer table for each site in _offsets, _is_local, _permute
|
|
||||||
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
|
|
||||||
{
|
|
||||||
int rd = _grid->_rdimensions[dimension];
|
|
||||||
|
|
||||||
if ( !_grid->CheckerBoarded(dimension) ) {
|
|
||||||
|
|
||||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
|
||||||
int o = 0; // relative offset to base within plane
|
|
||||||
int bo = 0; // offset in buffer
|
|
||||||
|
|
||||||
// Simple block stride gather of SIMD objects
|
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
|
||||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
|
||||||
_entries[point][so+o+b]._is_local=0;
|
|
||||||
_entries[point][so+o+b]._permute=0;
|
|
||||||
_entries[point][so+o+b]._around_the_world=wrap;
|
|
||||||
}
|
|
||||||
o +=_grid->_slice_stride[dimension];
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
|
|
||||||
int o = 0; // relative offset to base within plane
|
|
||||||
int bo = 0; // offset in buffer
|
|
||||||
|
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
|
||||||
|
|
||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
|
||||||
if ( ocb & cbmask ) {
|
|
||||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
|
||||||
_entries[point][so+o+b]._is_local=0;
|
|
||||||
_entries[point][so+o+b]._permute =0;
|
|
||||||
_entries[point][so+o+b]._around_the_world=wrap;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
o +=_grid->_slice_stride[dimension];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
|
|||||||
template<class vobj> inline
|
template<class vobj> inline
|
||||||
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
|
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
|
||||||
{
|
{
|
||||||
|
|
||||||
typedef typename vobj::scalar_type scalar_type ;
|
typedef typename vobj::scalar_type scalar_type ;
|
||||||
typedef typename vobj::vector_type vector_type ;
|
typedef typename vobj::vector_type vector_type ;
|
||||||
|
|
||||||
const int words=sizeof(vobj)/sizeof(vector_type);
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
const int Nsimd=vobj::vector_type::Nsimd();
|
const int Nsimd=vobj::vector_type::Nsimd();
|
||||||
|
|
||||||
int Nextr=extracted.size();
|
int Nextr=extracted.size();
|
||||||
int s = Nsimd/Nextr;
|
int s = Nsimd/Nextr;
|
||||||
|
scalar_type * vp = (scalar_type *)&vec;
|
||||||
|
|
||||||
std::vector<scalar_type *> pointers(Nsimd);
|
|
||||||
for(int i=0;i<Nextr;i++) {
|
|
||||||
pointers[i] =(scalar_type *)& extracted[i][offset];
|
|
||||||
}
|
|
||||||
|
|
||||||
vector_type *vp = (vector_type *)&vec;
|
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
extract<vector_type,scalar_type>(&vp[w],pointers,w);
|
for(int i=0;i<Nextr;i++){
|
||||||
|
scalar_type * pointer = (scalar_type *)& extracted[i][offset];
|
||||||
|
pointer[w] = vp[i*s+w*Nsimd];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
|
|||||||
const int words=sizeof(vobj)/sizeof(vector_type);
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
int Nextr=extracted.size();
|
int Nextr=extracted.size();
|
||||||
|
int s=Nsimd/Nextr;
|
||||||
|
|
||||||
std::vector<scalar_type *> pointers(Nextr);
|
scalar_type *pointer;
|
||||||
for(int i=0;i<Nextr;i++)
|
scalar_type *vp = (scalar_type *)&vec;
|
||||||
pointers[i] =(scalar_type *)& extracted[i][offset];
|
|
||||||
|
|
||||||
vector_type *vp = (vector_type *)&vec;
|
|
||||||
|
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
merge<vector_type,scalar_type>(&vp[w],pointers,w);
|
for(int i=0;i<Nextr;i++){
|
||||||
|
for(int ii=0;ii<s;ii++){
|
||||||
|
pointer=(scalar_type *)&extracted[i][offset];
|
||||||
|
vp[w*Nsimd+i*s+ii] = pointer[w];
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
|
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
|
||||||
|
|
||||||
|
|
||||||
Test_GaugeAction_SOURCES=Test_GaugeAction.cc
|
Test_GaugeAction_SOURCES=Test_GaugeAction.cc
|
||||||
|
Loading…
x
Reference in New Issue
Block a user