1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 11:15:55 +01:00

Merge branch 'master' of github.com:paboyle/Grid

Conflicts:
	lib/qcd/action/fermion/WilsonFermion5D.cc
This commit is contained in:
paboyle 2015-11-06 03:50:04 -08:00
commit 899ca41cb8
20 changed files with 559 additions and 8450 deletions

View File

@ -93,12 +93,13 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall; double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
} }

8040
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h) AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(gmp.h) AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

View File

@ -16,8 +16,9 @@
#include <iterator> #include <iterator>
#define __X86_64 #define __X86_64
#define EXECINFO
#ifdef EXECINFO
#ifdef HAVE_EXECINFO_H
#include <execinfo.h> #include <execinfo.h>
#endif #endif
@ -233,7 +234,9 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
printf(" mem address %llx\n",(unsigned long long)si->si_addr); printf(" mem address %llx\n",(unsigned long long)si->si_addr);
printf(" code %d\n",si->si_code); printf(" code %d\n",si->si_code);
#ifdef __X86_64 // Linux/Posix
#ifdef __linux__
// And x86 64bit
ucontext_t * uc= (ucontext_t *)ptr; ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(unsigned long long)sc->rip); printf(" instruction %llx\n",(unsigned long long)sc->rip);
@ -259,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
REG(r14); REG(r14);
REG(r15); REG(r15);
#endif #endif
#ifdef EXECINFO #ifdef HAVE_EXECINFO_H
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
for (int i = 0; i < symbols; i++){ for (int i = 0; i < symbols; i++){

View File

@ -48,10 +48,14 @@ namespace Grid {
int _around_the_world; int _around_the_world;
}; };
template<class vobj,class cobj, class compressor>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public: public:
typedef uint32_t StencilInteger; typedef uint32_t StencilInteger;
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int _checkerboard; int _checkerboard;
int _npoints; // Move to template param? int _npoints; // Move to template param?
@ -66,33 +70,334 @@ namespace Grid {
// npoints x Osites() of these // npoints x Osites() of these
std::vector<std::vector<StencilEntry> > _entries; std::vector<std::vector<StencilEntry> > _entries;
// Comms buffers
std::vector<std::vector<scalar_object> > send_buf_extract;
std::vector<std::vector<scalar_object> > recv_buf_extract;
std::vector<scalar_object *> pointers;
std::vector<scalar_object *> rpointers;
Vector<cobj> send_buf;
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; } inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
int _unified_buffer_size; int _unified_buffer_size;
int _request_count; int _request_count;
double buftime;
double gathertime;
double commtime;
double commstime;
double halotime;
double scattertime;
double mergetime;
double gathermtime;
double splicetime;
double nosplicetime;
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions, CartesianStencil(GridBase *grid,
const std::vector<int> &distances); int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
gathertime=0;
commtime=0;
commstime=0;
halotime=0;
scattertime=0;
mergetime=0;
gathermtime=0;
buftime=0;
splicetime=0;
nosplicetime=0;
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
void Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
void Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// CartesianStencil(GridBase *grid,
// int npoints,
// int checkerboard,
// const std::vector<int> &directions,
// const std::vector<int> &distances);
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim // Add to tables for various cases; is this mistaken. only local if 1 proc in dim
// Can this be avoided with simpler coding of comms? // Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask); // void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask); // void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap); // void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap); // void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// Could allow a functional munging of the halo to another type during the comms. // Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression. // this could implement the 16bit/32bit/64bit compression.
template<class vobj,class cobj, class compressor> void void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{ {
// conformable(source._grid,_grid); // conformable(source._grid,_grid);
assert(source._grid==_grid); assert(source._grid==_grid);
halotime-=usecond();
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size); if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0; int u_comm_offset=0;
@ -124,27 +429,35 @@ namespace Grid {
if ( comm_dim ) { if ( comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
if (splice_dim) { if (splice_dim) {
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress); GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
splicetime+=usecond();
} else { } else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress); GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
} }
} else { } else {
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if(splice_dim){ if(splice_dim){
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
splicetime+=usecond();
} else { } else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress); GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress); GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
} }
} }
} }
} }
halotime+=usecond();
} }
template<class vobj,class cobj, class compressor>
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask, void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf, std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress) int &u_comm_offset,compressor & compress)
@ -168,8 +481,7 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm... if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -186,7 +498,9 @@ namespace Grid {
int bytes = words * sizeof(cobj); int bytes = words * sizeof(cobj);
gathertime-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
gathertime+=usecond();
int rank = _grid->_processor; int rank = _grid->_processor;
int recv_from_rank; int recv_from_rank;
@ -196,32 +510,27 @@ namespace Grid {
assert (recv_from_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy // FIXME Implement asynchronous send & also avoid buffer copy
commtime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf[0], _grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&u_comm_buf[u_comm_offset],
recv_from_rank, recv_from_rank,
bytes); bytes);
commtime+=usecond();
for(int i=0;i<words;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
// std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<" mask 0x"<<cbmask<<std::endl;
}
u_comm_offset+=words; u_comm_offset+=words;
} }
} }
} }
template<class vobj,class cobj, class compressor>
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask, void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf, std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress) int &u_comm_offset,compressor &compress)
{ {
buftime-=usecond();
const int Nsimd = _grid->Nsimd(); const int Nsimd = _grid->Nsimd();
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@ -244,17 +553,22 @@ namespace Grid {
int words = sizeof(cobj)/sizeof(vector_type); int words = sizeof(cobj)/sizeof(vector_type);
assert(cbmask==0x3); // Fixme think there is a latent bug if not true assert(cbmask==0x3); // Fixme think there is a latent bug if not true
/*
* possibly slow to allocate
* Doesn't matter in this test, but may want to preallocate in the
* dirac operators
*/
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); // // Should grow to max size and then cost very little thereafter
std::vector<scalar_object *> rpointers(Nsimd); // received pointers send_buf_extract.resize(Nsimd);
recv_buf_extract.resize(Nsimd);
for(int l=0;l<Nsimd;l++){
if( send_buf_extract[l].size() < buffer_size) {
send_buf_extract[l].resize(buffer_size);
recv_buf_extract[l].resize(buffer_size);
}
}
pointers.resize(Nsimd);
rpointers.resize(Nsimd);
int bytes = buffer_size*sizeof(scalar_object);
buftime+=usecond();
/////////////////////////////////////////// ///////////////////////////////////////////
// Work out what to send where // Work out what to send where
@ -275,7 +589,9 @@ namespace Grid {
} }
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
gathermtime-=usecond();
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress); Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
gathermtime+=usecond();
for(int i=0;i<Nsimd;i++){ for(int i=0;i<Nsimd;i++){
@ -302,11 +618,13 @@ namespace Grid {
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
commstime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf_extract[i][0], (void *)&recv_buf_extract[i][0],
recv_from_rank, recv_from_rank,
bytes); bytes);
commstime+=usecond();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
@ -316,11 +634,13 @@ namespace Grid {
} }
// Here we don't want to scatter, just place into a buffer. // Here we don't want to scatter, just place into a buffer.
mergetime-=usecond();
PARALLEL_FOR_LOOP
for(int i=0;i<buffer_size;i++){ for(int i=0;i<buffer_size;i++){
assert(u_comm_offset+i<_unified_buffer_size); // assert(u_comm_offset+i<_unified_buffer_size);
merge(u_comm_buf[u_comm_offset+i],rpointers,i); merge(u_comm_buf[u_comm_offset+i],rpointers,i);
} }
mergetime+=usecond();
u_comm_offset+=buffer_size; u_comm_offset+=buffer_size;
} }
} }

View File

@ -170,7 +170,7 @@ namespace Grid {
//////////////////// ////////////////////
Geometry geom; Geometry geom;
GridBase * _grid; GridBase * _grid;
CartesianStencil Stencil; CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;

View File

@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP21 if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ PARALLEL_NESTED_LOOP2
for(int b=0;b<e2;b++){ for(int n=0;n<e1;n++){
int o = n*rhs._grid->_slice_stride[dimension]; for(int b=0;b<e2;b++){
// int bo = n*rhs._grid->_slice_block[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int bo = n*rhs._grid->_slice_block[dimension];
if ( ocb &cbmask ) { buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
} }
} }
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
}
} }
} }
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
//PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ if ( cbmask ==0x3){
for(int b=0;b<e2;b++){ PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension]; int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
if ( ocb & cbmask ) {
cobj temp;
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset); extract<cobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
} }
} }
} }
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP2 if ( cbmask ==0x3 ) {
for(int n=0;n<e1;n++){ PARALLEL_NESTED_LOOP2
for(int b=0;b<e2;b++){ for(int n=0;n<e1;n++){
int o =n*rhs._grid->_slice_stride[dimension]; for(int b=0;b<e2;b++){
// int bo =n*rhs._grid->_slice_block[dimension]; int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int bo =n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) { rhs._odata[so+o+b]=buffer[bo+b];
rhs._odata[so+o+b]=buffer[bo++]; }
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
} }
} }
} }
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2 PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset); merge(rhs._odata[so+o+b],pointers,offset);
} }
} }
} else {
assert(0); // think this is buggy FIXME
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
}
}
}
} }
} }
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2 PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b; int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); //lhs._odata[lo+o]=rhs._odata[ro+o];
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]); vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
} }
}
} else {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
} }
} }

View File

@ -26,7 +26,7 @@ namespace Grid {
// and Methods: // and Methods:
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) // void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) // void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St) // void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) // void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu) // void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// //
@ -101,6 +101,7 @@ namespace Grid {
typedef typename Impl::SiteSpinor SiteSpinor; \ typedef typename Impl::SiteSpinor SiteSpinor; \
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \ typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; typedef typename Impl::ImplParams ImplParams;
/////// ///////
@ -112,7 +113,6 @@ namespace Grid {
typedef ImplGauge<S,Nrepresentation> Gimpl; typedef ImplGauge<S,Nrepresentation> Gimpl;
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >; template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
@ -128,10 +128,11 @@ namespace Grid {
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor; typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams; typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
ImplParams Params; ImplParams Params;
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){ inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
mult(&phi(),&U(mu),&chi()); mult(&phi(),&U(mu),&chi());
} }
@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor; typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
typedef GparityWilsonImplParams ImplParams; typedef GparityWilsonImplParams ImplParams;
ImplParams Params; ImplParams Params;
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){ inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
typedef SiteHalfSpinor vobj; typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj; typedef typename SiteHalfSpinor::scalar_object sobj;

View File

@ -109,7 +109,7 @@ namespace QCD {
/////////////////////////////////// ///////////////////////////////////
template<class Impl> template<class Impl>
void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st, void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U, DoubledGaugeField & U,
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &A,
@ -123,7 +123,7 @@ namespace QCD {
FermionField Atilde(B._grid); FermionField Atilde(B._grid);
Atilde = A; Atilde = A;
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor); st.HaloExchange(B,comm_buf,compressor);
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag); Compressor compressor(dag);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor); Stencil.HaloExchange(in,comm_buf,compressor);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
template<class Impl> template<class Impl>
void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U, void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) { const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes)); assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor); st.HaloExchange(in,comm_buf,compressor);
if ( dag == DaggerYes ) { if ( dag == DaggerYes ) {
if( HandOptDslash ) { if( HandOptDslash ) {

View File

@ -73,14 +73,14 @@ namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Extra methods added by derived // Extra methods added by derived
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st, void DerivInternal(StencilImpl & st,
DoubledGaugeField & U, DoubledGaugeField & U,
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &A,
const FermionField &B, const FermionField &B,
int dag); int dag);
void DhopInternal(CartesianStencil & st,DoubledGaugeField & U, void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ; const FermionField &in, FermionField &out,int dag) ;
@ -108,9 +108,9 @@ namespace Grid {
GridBase * _cbgrid; GridBase * _cbgrid;
//Defines the stencils for even and odd //Defines the stencils for even and odd
CartesianStencil Stencil; StencilImpl Stencil;
CartesianStencil StencilEven; StencilImpl StencilEven;
CartesianStencil StencilOdd; StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@ -70,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu); ImportGauge(_Umu);
commtime=0;
dslashtime=0;
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -87,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t; // assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo); Compressor compressor(DaggerNo);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor); Stencil.HaloExchange(in,comm_buf,compressor);
int skip = (disp==1) ? 0 : 1; int skip = (disp==1) ? 0 : 1;
@ -107,7 +109,7 @@ PARALLEL_FOR_LOOP
}; };
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st, void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U, DoubledGaugeField & U,
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &A,
@ -124,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
FermionField Btilde(B._grid); FermionField Btilde(B._grid);
FermionField Atilde(B._grid); FermionField Atilde(B._grid);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor); st.HaloExchange(B,comm_buf,compressor);
Atilde=A; Atilde=A;
@ -196,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag); DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
} }
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat, void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A, const FermionField &A,
@ -214,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo, void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
@ -229,13 +252,16 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
int cores = GridThread::GetCores(); int cores = GridThread::GetCores();
int nwork = U._grid->oSites(); int nwork = U._grid->oSites();
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor); commtime -=usecond();
st.HaloExchange(in,comm_buf,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Not loop ordering and data layout. // Not loop ordering and data layout.
// Designed to create // Designed to create
// - per thread reuse in L1 cache for U // - per thread reuse in L1 cache for U
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
dslashtime -=usecond();
if ( dag == DaggerYes ) { if ( dag == DaggerYes ) {
if( this->HandOptDslash ) { if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
@ -349,6 +375,7 @@ PARALLEL_FOR_LOOP
} }
} }
} }
dslashtime +=usecond();
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)

View File

@ -32,7 +32,8 @@ namespace Grid {
public: public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
double commtime;
double dslashtime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -73,14 +74,14 @@ namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// New methods added // New methods added
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st, void DerivInternal(StencilImpl & st,
DoubledGaugeField & U, DoubledGaugeField & U,
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &A,
const FermionField &B, const FermionField &B,
int dag); int dag);
void DhopInternal(CartesianStencil & st, void DhopInternal(StencilImpl & st,
LebesgueOrder &lo, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
@ -98,6 +99,7 @@ namespace Grid {
// DoubleStore // DoubleStore
void ImportGauge(const GaugeField &_Umu); void ImportGauge(const GaugeField &_Umu);
void Report(void);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Data members require to support the functionality // Data members require to support the functionality
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -113,9 +115,9 @@ namespace Grid {
int Ls; int Ls;
//Defines the stencils for even and odd //Defines the stencils for even and odd
CartesianStencil Stencil; StencilImpl Stencil;
CartesianStencil StencilEven; StencilImpl StencilEven;
CartesianStencil StencilOdd; StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@ -3,7 +3,7 @@ namespace Grid {
namespace QCD { namespace QCD {
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
}; };
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
} }
template<class Impl> template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U, void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma) int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
{ {

View File

@ -17,15 +17,15 @@ namespace Grid {
typedef FermionOperator<Impl> Base; typedef FermionOperator<Impl> Base;
public: public:
void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U, void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out); int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U, void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out); int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U, void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma); int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
#if defined(AVX512) || defined(IMCI) #if defined(AVX512) || defined(IMCI)
@ -41,23 +41,23 @@ namespace Grid {
#endif #endif
#define HANDOPT #define HANDOPT
#ifdef HANDOPT #ifdef HANDOPT
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U, void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out); int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U, void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out); int sF,int sU,const FermionField &in, FermionField &out);
#else #else
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U, void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3 DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
} }
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U, void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out) int sF,int sU,const FermionField &in, FermionField &out)
{ {

View File

@ -75,7 +75,7 @@ namespace Grid {
namespace QCD { namespace QCD {
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U, void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers) int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
{ {

View File

@ -282,7 +282,7 @@ namespace QCD {
#ifdef HANDOPT #ifdef HANDOPT
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U, void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out)
{ {
@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
} }
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U, void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out) int ss,int sU,const FermionField &in, FermionField &out)
{ {

View File

@ -204,7 +204,7 @@ namespace Optimization {
#if defined (AVX2) #if defined (AVX2)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif
} }
@ -248,8 +248,8 @@ namespace Optimization {
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif
#if defined (AVX2) #if defined (AVX2)
__m256d a_real = _mm256_moveldup_pd( a ); // Ar Ar __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_movehdup_pd( a ); // Ai Ai __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif

View File

@ -1,275 +1,6 @@
#include "Grid.h" #include "Grid.h"
namespace Grid { namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
void CartesianStencil::Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
void CartesianStencil::Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
} }

View File

@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
template<class vobj> inline template<class vobj> inline
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset) void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
{ {
typedef typename vobj::scalar_type scalar_type ; typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ; typedef typename vobj::vector_type vector_type ;
const int words=sizeof(vobj)/sizeof(vector_type); const int words=sizeof(vobj)/sizeof(vector_type);
const int Nsimd=vobj::vector_type::Nsimd(); const int Nsimd=vobj::vector_type::Nsimd();
int Nextr=extracted.size(); int Nextr=extracted.size();
int s = Nsimd/Nextr; int s = Nsimd/Nextr;
scalar_type * vp = (scalar_type *)&vec;
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nextr;i++) {
pointers[i] =(scalar_type *)& extracted[i][offset];
}
vector_type *vp = (vector_type *)&vec;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
extract<vector_type,scalar_type>(&vp[w],pointers,w); for(int i=0;i<Nextr;i++){
scalar_type * pointer = (scalar_type *)& extracted[i][offset];
pointer[w] = vp[i*s+w*Nsimd];
}
} }
} }
@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
const int words=sizeof(vobj)/sizeof(vector_type); const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr=extracted.size(); int Nextr=extracted.size();
int s=Nsimd/Nextr;
std::vector<scalar_type *> pointers(Nextr); scalar_type *pointer;
for(int i=0;i<Nextr;i++) scalar_type *vp = (scalar_type *)&vec;
pointers[i] =(scalar_type *)& extracted[i][offset];
vector_type *vp = (vector_type *)&vec;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
merge<vector_type,scalar_type>(&vp[w],pointers,w); for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
pointer=(scalar_type *)&extracted[i][offset];
vp[w*Nsimd+i*s+ii] = pointer[w];
}
}
} }
} }
} }
#endif #endif

View File

@ -1,5 +1,5 @@
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
Test_GaugeAction_SOURCES=Test_GaugeAction.cc Test_GaugeAction_SOURCES=Test_GaugeAction.cc