mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Merge branch 'master' of github.com:paboyle/Grid

Conflicts:
	lib/qcd/action/fermion/WilsonFermion5D.cc
paboyle 2015-11-06 03:50:04 -08:00
commit 899ca41cb8
20 changed files with 559 additions and 8450 deletions

View File

@ -93,12 +93,13 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
}
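
Since t1-t0 above is measured in microseconds, flops/(t1-t0) is already in Mflop/s (the factor of 10^6 cancels), with 1344 floating point operations assumed per site per Dw application. A minimal standalone sketch of the same arithmetic, with illustrative (assumed) lattice size, Ls, call count and timing:

#include <cstdio>

int main(void) {
  const double Ls       = 16;                     // assumed extent of the fifth dimension
  const int    latt4[4] = {16, 16, 16, 16};       // assumed 4d lattice
  const double ncall    = 1000;                   // assumed number of Dw applications
  const double t_us     = 2.0e6;                  // assumed elapsed time in microseconds

  double volume = Ls;
  for (int mu = 0; mu < 4; mu++) volume *= latt4[mu];

  double flops = 1344.0 * volume * ncall;         // 1344 flops per site per call
  std::printf("mflop/s = %f\n", flops / t_us);    // flops per microsecond == Mflop/s
  return 0;
}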

configure (vendored): file diff suppressed because it is too large (8040 lines changed).

View File

@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

View File

@ -16,8 +16,9 @@
#include <iterator>
#define __X86_64
#define EXECINFO
#ifdef EXECINFO
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
@ -233,7 +234,9 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
printf(" mem address %llx\n",(unsigned long long)si->si_addr);
printf(" code %d\n",si->si_code);
#ifdef __X86_64
// Linux/Posix
#ifdef __linux__
// And x86 64bit
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(unsigned long long)sc->rip);
@ -259,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
REG(r14);
REG(r15);
#endif
#ifdef EXECINFO
#ifdef HAVE_EXECINFO_H
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
for (int i = 0; i < symbols; i++){
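
For reference, a minimal, self-contained sketch of the HAVE_EXECINFO_H-guarded backtrace pattern used above; the helper name, buffer depth and the standalone main are assumptions for illustration:

#include <cstdio>
#include <cstdlib>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif

static void print_backtrace(void) {
#ifdef HAVE_EXECINFO_H
  void *buffer[32];                                     // assumed backtrace depth
  int symbols    = backtrace(buffer, 32);               // collect return addresses
  char **strings = backtrace_symbols(buffer, symbols);  // translate to printable strings
  for (int i = 0; i < symbols; i++) std::printf("%s\n", strings[i]);
  std::free(strings);                                   // backtrace_symbols allocates with malloc
#endif
}

int main(void) { print_backtrace(); return 0; }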

View File

@ -48,10 +48,14 @@ namespace Grid {
int _around_the_world;
};
template<class vobj,class cobj, class compressor>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
typedef uint32_t StencilInteger;
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int _checkerboard;
int _npoints; // Move to template param?
@ -66,33 +70,334 @@ namespace Grid {
// npoints x Osites() of these
std::vector<std::vector<StencilEntry> > _entries;
// Comms buffers
std::vector<std::vector<scalar_object> > send_buf_extract;
std::vector<std::vector<scalar_object> > recv_buf_extract;
std::vector<scalar_object *> pointers;
std::vector<scalar_object *> rpointers;
Vector<cobj> send_buf;
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
int _unified_buffer_size;
int _request_count;
double buftime;
double gathertime;
double commtime;
double commstime;
double halotime;
double scattertime;
double mergetime;
double gathermtime;
double splicetime;
double nosplicetime;
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances);
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
gathertime=0;
commtime=0;
commstime=0;
halotime=0;
scattertime=0;
mergetime=0;
gathermtime=0;
buftime=0;
splicetime=0;
nosplicetime=0;
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in the local lattice or in a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
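// Illustrative usage (a hedged sketch; the concrete values are assumptions, not taken from this file):
// a 4d nearest-neighbour stencil would have npoints=8, one (direction,displacement) pair per
// forward and backward hop, e.g.
//   std::vector<int> directions   ({0,1,2,3, 0, 1, 2, 3});
//   std::vector<int> displacements({1,1,1,1,-1,-1,-1,-1});
// and the Wilson implementations later in this diff instantiate the template as
// CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor>. The cbmask argument passed to
// Local/Comms selects the checkerboard of a pass: 0x1 even sites, 0x2 odd sites, 0x3 both.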
void Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
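// Worked example of the permute_slice logic above (illustrative numbers, assumed for clarity):
// with rd=4 and sshift=1, wrap=sshift/rd=0 and num=sshift%rd=1, so slices x=0,1,2 (x<rd-num)
// get permute_slice=wrap=0, while the final slice x=3 gets permute_slice=1-wrap=1; only the
// slice that wraps around the reduced dimension needs a SIMD-lane permute.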
void Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// sent to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// CartesianStencil(GridBase *grid,
// int npoints,
// int checkerboard,
// const std::vector<int> &directions,
// const std::vector<int> &distances);
// Add to tables for various cases; is this mistaken? Only local if there is 1 proc in the dim.
// Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// void Local (int point, int dimension,int shift,int cbmask);
// void Comms (int point, int dimension,int shift,int cbmask);
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
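// A hedged sketch of the compressor concept assumed here: a functor whose operator() maps the
// stored site object to the (possibly smaller) cobj that is actually communicated, matching the
// call sites of the form compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid). An
// illustrative identity compressor (the name is hypothetical, not part of this code base):
//   template<class vobj> class PassThroughCompressor {
//   public:
//     vobj operator()(const vobj &arg,int dimension,int plane,int osite,GridBase *grid){ return arg; }
//   };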
template<class vobj,class cobj, class compressor> void
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
halotime-=usecond();
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0;
@ -124,27 +429,35 @@ namespace Grid {
if ( comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
splicetime+=usecond();
} else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
}
} else {
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if(splice_dim){
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
splicetime+=usecond();
} else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
}
}
}
}
halotime+=usecond();
}
template<class vobj,class cobj, class compressor>
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress)
@ -168,8 +481,7 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -186,7 +498,9 @@ namespace Grid {
int bytes = words * sizeof(cobj);
gathertime-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
gathertime+=usecond();
int rank = _grid->_processor;
int recv_from_rank;
@ -196,32 +510,27 @@ namespace Grid {
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
commtime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
(void *)&u_comm_buf[u_comm_offset],
recv_from_rank,
bytes);
commtime+=usecond();
for(int i=0;i<words;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
// std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<" mask 0x"<<cbmask<<std::endl;
}
u_comm_offset+=words;
}
}
}
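// The exchange above is assumed to behave like a blocking pairwise send/receive of `bytes`
// bytes; in MPI terms (a sketch only, not the Grid communicator API itself) it corresponds to:
//   MPI_Sendrecv(&send_buf[0],               bytes, MPI_CHAR, xmit_to_rank,   0,
//                &u_comm_buf[u_comm_offset], bytes, MPI_CHAR, recv_from_rank, 0,
//                communicator, MPI_STATUS_IGNORE);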
template<class vobj,class cobj, class compressor>
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress)
{
buftime-=usecond();
const int Nsimd = _grid->Nsimd();
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -244,17 +553,22 @@ namespace Grid {
int words = sizeof(cobj)/sizeof(vector_type);
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
/*
* possibly slow to allocate
* Doesn't matter in this test, but may want to preallocate in the
* dirac operators
*/
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
// Should grow to max size and then cost very little thereafter
send_buf_extract.resize(Nsimd);
recv_buf_extract.resize(Nsimd);
for(int l=0;l<Nsimd;l++){
if( send_buf_extract[l].size() < buffer_size) {
send_buf_extract[l].resize(buffer_size);
recv_buf_extract[l].resize(buffer_size);
}
}
pointers.resize(Nsimd);
rpointers.resize(Nsimd);
int bytes = buffer_size*sizeof(scalar_object);
buftime+=usecond();
///////////////////////////////////////////
// Work out what to send where
@ -275,7 +589,9 @@ namespace Grid {
}
int sx = (x+sshift)%rd;
gathermtime-=usecond();
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
gathermtime+=usecond();
for(int i=0;i<Nsimd;i++){
@ -302,11 +618,13 @@ namespace Grid {
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
commstime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
commstime+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
@ -316,11 +634,13 @@ namespace Grid {
}
// Here we don't want to scatter, just place into a buffer.
mergetime-=usecond();
PARALLEL_FOR_LOOP
for(int i=0;i<buffer_size;i++){
assert(u_comm_offset+i<_unified_buffer_size);
// assert(u_comm_offset+i<_unified_buffer_size);
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
}
mergetime+=usecond();
u_comm_offset+=buffer_size;
}
}

View File

@ -170,7 +170,7 @@ namespace Grid {
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil Stencil;
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
std::vector<CoarseMatrix> A;

View File

@ -29,17 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP21
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
// int bo = n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
}
}
}
@ -60,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
//PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb & cbmask ) {
cobj temp;
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
}
}
}
@ -110,15 +135,26 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int bo=0;
//PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
// int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
if ( cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
}
}
@ -139,16 +175,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
}
}
} else {
assert(0); // think this is buggy FIXME
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
}
}
}
}
}
@ -168,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
int o =n*rhs._grid->_slice_stride[dimension]+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
} else {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
}
}

View File

@ -26,7 +26,7 @@ namespace Grid {
// and Methods:
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
//
@ -101,6 +101,7 @@ namespace Grid {
typedef typename Impl::SiteSpinor SiteSpinor; \
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams;
///////
@ -112,7 +113,6 @@ namespace Grid {
typedef ImplGauge<S,Nrepresentation> Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
@ -128,10 +128,11 @@ namespace Grid {
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
ImplParams Params;
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
mult(&phi(),&U(mu),&chi());
}
@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
typedef GparityWilsonImplParams ImplParams;
ImplParams Params;
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;

View File

@ -109,7 +109,7 @@ namespace QCD {
///////////////////////////////////
template<class Impl>
void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
@ -123,7 +123,7 @@ namespace QCD {
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
st.HaloExchange(B,comm_buf,compressor);
for(int mu=0;mu<Nd;mu++){
@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
Stencil.HaloExchange(in,comm_buf,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
template<class Impl>
void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
st.HaloExchange(in,comm_buf,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {

View File

@ -73,14 +73,14 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st,
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
@ -108,9 +108,9 @@ namespace Grid {
GridBase * _cbgrid;
//Defines the stencils for even and odd
CartesianStencil Stencil;
CartesianStencil StencilEven;
CartesianStencil StencilOdd;
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -70,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
commtime=0;
dslashtime=0;
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -87,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
Stencil.HaloExchange(in,comm_buf,compressor);
int skip = (disp==1) ? 0 : 1;
@ -107,7 +109,7 @@ PARALLEL_FOR_LOOP
};
template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
@ -124,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
st.HaloExchange(B,comm_buf,compressor);
Atilde=A;
@ -196,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
@ -214,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
@ -229,13 +252,16 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
commtime -=usecond();
st.HaloExchange(in,comm_buf,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for the fermion
// Note loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 linear-access unit-stride streams per thread for the Fermion, suitable for hardware prefetch.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static)
@ -349,6 +375,7 @@ PARALLEL_FOR_LOOP
}
}
}
dslashtime +=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)

View File

@ -32,7 +32,8 @@ namespace Grid {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
double commtime;
double dslashtime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
@ -73,14 +74,14 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st,
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(CartesianStencil & st,
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
@ -98,6 +99,7 @@ namespace Grid {
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
void Report(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
@ -113,9 +115,9 @@ namespace Grid {
int Ls;
//Defines the stencils for even and odd
CartesianStencil Stencil;
CartesianStencil StencilEven;
CartesianStencil StencilOdd;
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -3,7 +3,7 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
};
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
{

View File

@ -17,15 +17,15 @@ namespace Grid {
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
#if defined(AVX512) || defined(IMCI)
@ -41,23 +41,23 @@ namespace Grid {
#endif
#define HANDOPT
#ifdef HANDOPT
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
#else
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{

View File

@ -75,7 +75,7 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
{

View File

@ -282,7 +282,7 @@ namespace QCD {
#ifdef HANDOPT
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{

View File

@ -204,7 +204,7 @@ namespace Optimization {
#if defined (AVX2)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
@ -248,8 +248,8 @@ namespace Optimization {
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2)
__m256d a_real = _mm256_moveldup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_movehdup_pd( a ); // Ai Ai
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
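
The intrinsic sequences above implement the standard complex product (Ar*Br - Ai*Bi, Ar*Bi + Ai*Br); a scalar reference for cross-checking the sign pattern (a hedged sketch with illustrative operands, not Grid code):

#include <complex>
#include <cstdio>

int main(void) {
  std::complex<double> a(1.0, 2.0), b(3.0, 4.0);      // illustrative operands
  double re = a.real()*b.real() - a.imag()*b.imag();  // Ar*Br - Ai*Bi
  double im = a.real()*b.imag() + a.imag()*b.real();  // Ar*Bi + Ai*Br
  std::printf("manual (%f,%f) vs std::complex (%f,%f)\n", re, im, (a*b).real(), (a*b).imag());
  return 0;
}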

View File

@ -1,275 +1,6 @@
#include "Grid.h"
namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
void CartesianStencil::Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
void CartesianStencil::Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
}

View File

@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
template<class vobj> inline
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
{
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int words=sizeof(vobj)/sizeof(vector_type);
const int Nsimd=vobj::vector_type::Nsimd();
int Nextr=extracted.size();
int s = Nsimd/Nextr;
scalar_type * vp = (scalar_type *)&vec;
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nextr;i++) {
pointers[i] =(scalar_type *)& extracted[i][offset];
}
vector_type *vp = (vector_type *)&vec;
for(int w=0;w<words;w++){
extract<vector_type,scalar_type>(&vp[w],pointers,w);
for(int i=0;i<Nextr;i++){
scalar_type * pointer = (scalar_type *)& extracted[i][offset];
pointer[w] = vp[i*s+w*Nsimd];
}
}
}
@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr=extracted.size();
int s=Nsimd/Nextr;
std::vector<scalar_type *> pointers(Nextr);
for(int i=0;i<Nextr;i++)
pointers[i] =(scalar_type *)& extracted[i][offset];
vector_type *vp = (vector_type *)&vec;
scalar_type *pointer;
scalar_type *vp = (scalar_type *)&vec;
for(int w=0;w<words;w++){
merge<vector_type,scalar_type>(&vp[w],pointers,w);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
pointer=(scalar_type *)&extracted[i][offset];
vp[w*Nsimd+i*s+ii] = pointer[w];
}
}
}
}
}
}
#endif
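
The extract/merge pair above shuffles scalars between one packed SIMD object (words vector words, each of Nsimd lanes, so lane l of word w sits at flat scalar index w*Nsimd+l) and per-lane scalar buffers. A type-agnostic analogue of that indexing, with assumed Nsimd and words values and no Grid types involved:

#include <array>
#include <cstdio>
#include <vector>

int main(void) {
  constexpr int Nsimd = 4, words = 2;             // assumed lane count and vector words per object
  std::array<double, words * Nsimd> vec{};        // stands in for the packed vobj
  for (int i = 0; i < words * Nsimd; i++) vec[i] = i;

  // "extract": one scalar buffer per SIMD lane; word w of lane l lives at vec[w*Nsimd + l]
  std::vector<std::array<double, words> > lanes(Nsimd);
  for (int l = 0; l < Nsimd; l++)
    for (int w = 0; w < words; w++) lanes[l][w] = vec[w * Nsimd + l];

  // "merge": rebuild the packed object from the per-lane buffers
  std::array<double, words * Nsimd> back{};
  for (int l = 0; l < Nsimd; l++)
    for (int w = 0; w < words; w++) back[w * Nsimd + l] = lanes[l][w];

  std::printf("round trip ok: %d\n", int(vec == back));
  return 0;
}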

View File

@ -1,5 +1,5 @@
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
Test_GaugeAction_SOURCES=Test_GaugeAction.cc