1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

Overlap comms compute changes

This commit is contained in:
paboyle 2016-01-10 19:20:16 +00:00
parent c99d748da6
commit d19321dfde
8 changed files with 220 additions and 184 deletions

View File

@ -7,8 +7,6 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -88,11 +86,78 @@ namespace Grid {
typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object; typedef typename cobj::scalar_object scalar_object;
//////////////////////////////////////////
// Comms packet queue for asynch thread
//////////////////////////////////////////
// Description of one deferred point-to-point exchange. Each field is passed
// unchanged to _grid->SendToRecvFrom() when Communicate() replays the queue.
struct Packet {
// Start of the data to transmit (send-buffer argument of SendToRecvFrom).
void * send_buf;
// Where the incoming data is to be written (receive-buffer argument).
void * recv_buf;
// Rank the send buffer is transmitted to.
Integer to_rank;
// Rank the receive buffer is filled from.
Integer from_rank;
// Size of the transfer, passed as the byte count of SendToRecvFrom.
Integer bytes;
};
// Queue of pending exchanges: appended to by AddPacket(), walked front to
// back by Communicate(), and emptied at the start of HaloExchangeBegin().
std::vector<Packet> Packets;
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
Packet p;
p.send_buf = xmit;
p.recv_buf = rcv;
p.to_rank = to;
p.from_rank= from;
p.bytes = bytes;
Packets.push_back(p);
}
void Communicate(void ) {
for(int i=0;i<Packets.size();i++){
_grid->SendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes);
}
}
///////////////////////////////////////////
// Simd merge queue for asynch comms
///////////////////////////////////////////
// Description of one deferred SIMD merge. CommsMerge() executes it as
//   merge(mpointer[o], rpointers, o)   for o in [0, buffer_size).
struct Merge {
// Destination array of merged objects; indexed by o in CommsMerge().
cobj * mpointer;
// Source pointers handed to merge(); GatherSimd() builds this with Nsimd
// entries, one per SIMD lane.
std::vector<scalar_object *> rpointers;
// Number of destination elements to produce (upper bound of the merge loop).
Integer buffer_size;
};
// Queue of pending merges: appended to by AddMerge(), executed by
// CommsMerge(), and emptied at the start of HaloExchangeBegin().
std::vector<Merge> Mergers;
// Append one merge request to the Mergers queue. No merging happens here;
// CommsMerge() performs the work later.
//   merge_p     - destination array that CommsMerge() will write
//   rpointers   - source pointers for merge(); copied, so the caller's
//                 vector may be reused or destroyed after this returns
//   buffer_size - number of destination elements to produce
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size) {
  // Take the single required copy *before* growing Mergers, so the call
  // stays valid even if `rpointers` refers to an element of Mergers that a
  // reallocation would move.
  std::vector<scalar_object *> lanes(rpointers);

  // Construct the queue entry in place and hand it the copy via swap; the
  // original copied the pointer vector a second time in push_back().
  Mergers.emplace_back();
  Merge &slot = Mergers.back();
  slot.mpointer    = merge_p;
  slot.rpointers.swap(lanes);
  slot.buffer_size = buffer_size;
}
// Execute every merge recorded in Mergers, in queue order. For each entry
// the destination elements [0, buffer_size) are filled by
// merge(mpointer[o], rpointers, o); that inner loop is annotated with
// PARALLEL_FOR_LOOP. Elapsed time is accumulated into `mergetime`.
// HaloExchangeComplete() calls this after joining the comms thread.
void CommsMerge(void ) {
  mergetime-=usecond();
  // Range-for avoids comparing a signed int index against the unsigned
  // Mergers.size().
  for(auto &job : Mergers){
    // The inner loop keeps the plain index form because PARALLEL_FOR_LOOP
    // must be followed directly by a for statement.
PARALLEL_FOR_LOOP
    for(int o=0;o<job.buffer_size;o++){
      merge(job.mpointer[o],job.rpointers,o);
    }
  }
  mergetime+=usecond();
}
////////////////////////////////////////
// Basic Grid and stencil info
////////////////////////////////////////
int _checkerboard; int _checkerboard;
int _npoints; // Move to template param? int _npoints; // Move to template param?
GridBase * _grid; GridBase * _grid;
// npoints of these // npoints of these
std::vector<int> _directions; std::vector<int> _directions;
std::vector<int> _distances; std::vector<int> _distances;
@ -101,19 +166,21 @@ namespace Grid {
// npoints x Osites() of these // npoints x Osites() of these
std::vector<std::vector<StencilEntry> > _entries; std::vector<std::vector<StencilEntry> > _entries;
// Comms buffers
std::vector<std::vector<scalar_object> > send_buf_extract;
std::vector<std::vector<scalar_object> > recv_buf_extract;
std::vector<scalar_object *> pointers;
std::vector<scalar_object *> rpointers;
Vector<cobj> send_buf;
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; } inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
// Comms buffers
std::vector<Vector<scalar_object> > u_simd_send_buf;
std::vector<Vector<scalar_object> > u_simd_recv_buf;
Vector<cobj> u_send_buf;
Vector<cobj> comm_buf;
int u_comm_offset;
int _unified_buffer_size; int _unified_buffer_size;
int _request_count;
/////////////////////////////////////////
// Timing info; ugly; possibly temporary
/////////////////////////////////////////
#define TIMING_HACK
#ifdef TIMING_HACK
double buftime; double buftime;
double gathertime; double gathertime;
double commtime; double commtime;
@ -124,9 +191,7 @@ namespace Grid {
double gathermtime; double gathermtime;
double splicetime; double splicetime;
double nosplicetime; double nosplicetime;
#endif
CartesianStencil(GridBase *grid, CartesianStencil(GridBase *grid,
int npoints, int npoints,
@ -135,6 +200,7 @@ namespace Grid {
const std::vector<int> &distances) const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints) : _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{ {
#ifdef TIMING_HACK
gathertime=0; gathertime=0;
commtime=0; commtime=0;
commstime=0; commstime=0;
@ -145,13 +211,12 @@ namespace Grid {
buftime=0; buftime=0;
splicetime=0; splicetime=0;
nosplicetime=0; nosplicetime=0;
#endif
_npoints = npoints; _npoints = npoints;
_grid = grid; _grid = grid;
_directions = directions; _directions = directions;
_distances = distances; _distances = distances;
_unified_buffer_size=0; _unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites(); int osites = _grid->oSites();
@ -197,21 +262,24 @@ namespace Grid {
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
// std::cout<<"Comms 0x3"<<std::endl;
Comms(point,dimension,shift,0x3); Comms(point,dimension,shift,0x3);
} else { } else {
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration Comms(point,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
} }
u_send_buf.resize(_unified_buffer_size);
comm_buf.resize(_unified_buffer_size);
const int Nsimd = grid->Nsimd();
u_simd_send_buf.resize(Nsimd);
u_simd_recv_buf.resize(Nsimd);
for(int l=0;l<Nsimd;l++){
u_simd_send_buf[l].resize(_unified_buffer_size);
u_simd_recv_buf[l].resize(_unified_buffer_size);
}
}
void Local (int point, int dimension,int shiftpm,int cbmask) void Local (int point, int dimension,int shiftpm,int cbmask)
{ {
@ -276,17 +344,15 @@ namespace Grid {
assert(shift<fd); assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
// std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes. // send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
int permute_type=grid->PermuteType(dimension); int permute_type=grid->PermuteType(dimension);
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
@ -310,16 +376,9 @@ namespace Grid {
} else { } else {
int comm_proc = ((x+sshift)/rd)%pd; int comm_proc = ((x+sshift)/rd)%pd;
offnode = (comm_proc!= 0); offnode = (comm_proc!= 0);
// std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
} }
// Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
// x+sshift = 4
// x+sshift/2 = 2
// 2%2 == 0
// Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.
int wraparound=0; int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1; wraparound = 1;
@ -337,15 +396,13 @@ namespace Grid {
int words = buffer_size; int words = buffer_size;
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor; int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size; int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words; _unified_buffer_size += words;
// std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
} }
@ -441,39 +498,34 @@ namespace Grid {
} }
} }
// CartesianStencil(GridBase *grid,
// int npoints,
// int checkerboard,
// const std::vector<int> &directions,
// const std::vector<int> &distances);
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
Mergers.resize(0);
Packets.resize(0);
HaloGather(source,compress);
return std::thread([&] { this->Communicate(); });
}
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim void HaloExchange(const Lattice<vobj> &source,compressor &compress)
// Can this be avoided with simpler coding of comms? {
// void Local (int point, int dimension,int shift,int cbmask); auto thr = HaloExchangeBegin(source,compress);
// void Comms (int point, int dimension,int shift,int cbmask); HaloExchangeComplete(thr);
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap); }
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
void HaloExchangeComplete(std::thread &thr)
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{ {
std::thread thr = HaloExchangeBegin(source,u_comm_buf,compress);
thr.join(); thr.join();
CommsMerge();
} }
std::thread HaloExchangeBegin(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > & u_comm_buf,compressor &compress) { void HaloGather(const Lattice<vobj> &source,compressor &compress)
return std::thread([&] { this->HaloExchangeBlocking(source,u_comm_buf,compress); });
}
void HaloExchangeBlocking(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{ {
// conformable(source._grid,_grid); // conformable(source._grid,_grid);
assert(source._grid==_grid); assert(source._grid==_grid);
halotime-=usecond(); halotime-=usecond();
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0; assert (comm_buf.size() == _unified_buffer_size );
u_comm_offset=0;
// Gather all comms buffers // Gather all comms buffers
for(int point = 0 ; point < _npoints; point++) { for(int point = 0 ; point < _npoints; point++) {
@ -506,35 +558,34 @@ namespace Grid {
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
if (splice_dim) { if (splice_dim) {
splicetime-=usecond(); splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress); GatherSimd(source,dimension,shift,0x3,compress);
splicetime+=usecond(); splicetime+=usecond();
} else { } else {
nosplicetime-=usecond(); nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress); Gather(source,dimension,shift,0x3,compress);
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} else { } else {
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if(splice_dim){ if(splice_dim){
splicetime-=usecond(); splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
splicetime+=usecond(); splicetime+=usecond();
} else { } else {
nosplicetime-=usecond(); nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress); Gather(source,dimension,shift,0x1,compress);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress); Gather(source,dimension,shift,0x2,compress);
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} }
} }
} }
assert(u_comm_offset==_unified_buffer_size);
halotime+=usecond(); halotime+=usecond();
} }
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask, void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress)
{ {
typedef typename cobj::vector_type vector_type; typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_type scalar_type;
@ -555,8 +606,6 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -573,7 +622,7 @@ namespace Grid {
int bytes = words * sizeof(cobj); int bytes = words * sizeof(cobj);
gathertime-=usecond(); gathertime-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress); Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
gathertime+=usecond(); gathertime+=usecond();
int rank = _grid->_processor; int rank = _grid->_processor;
@ -585,11 +634,19 @@ namespace Grid {
// FIXME Implement asynchronous send & also avoid buffer copy // FIXME Implement asynchronous send & also avoid buffer copy
commtime-=usecond(); commtime-=usecond();
/*
_grid->SendToRecvFrom((void *)&send_buf[0], _grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&u_comm_buf[u_comm_offset], (void *)&comm_buf[u_comm_offset],
recv_from_rank, recv_from_rank,
bytes); bytes);
*/
AddPacket((void *)&u_send_buf[u_comm_offset],
(void *)&comm_buf[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
commtime+=usecond(); commtime+=usecond();
u_comm_offset+=words; u_comm_offset+=words;
@ -598,14 +655,11 @@ namespace Grid {
} }
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask, void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress)
{ {
buftime-=usecond(); buftime-=usecond();
const int Nsimd = _grid->Nsimd(); const int Nsimd = _grid->Nsimd();
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension]; int ld = _grid->_ldimensions[dimension];
@ -628,20 +682,11 @@ namespace Grid {
assert(cbmask==0x3); // Fixme think there is a latent bug if not true assert(cbmask==0x3); // Fixme think there is a latent bug if not true
// Should grow to max size and then cost very little thereafter
send_buf_extract.resize(Nsimd);
recv_buf_extract.resize(Nsimd);
for(int l=0;l<Nsimd;l++){
if( send_buf_extract[l].size() < buffer_size) {
send_buf_extract[l].resize(buffer_size);
recv_buf_extract[l].resize(buffer_size);
}
}
pointers.resize(Nsimd);
rpointers.resize(Nsimd);
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> rpointers(Nsimd);
std::vector<scalar_object *> spointers(Nsimd);
buftime+=usecond(); buftime+=usecond();
/////////////////////////////////////////// ///////////////////////////////////////////
@ -659,16 +704,19 @@ namespace Grid {
if ( any_offnode ) { if ( any_offnode ) {
for(int i=0;i<Nsimd;i++){ for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0]; spointers[i] = &u_simd_send_buf[i][u_comm_offset];
} }
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
gathermtime-=usecond(); gathermtime-=usecond();
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress); Gather_plane_extract<cobj>(rhs,spointers,dimension,sx,cbmask,compress);
gathermtime+=usecond(); gathermtime+=usecond();
for(int i=0;i<Nsimd;i++){ for(int i=0;i<Nsimd;i++){
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
int inner_bit = (Nsimd>>(permute_type+1)); int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0; int ic= (i&inner_bit)? 1:0;
@ -680,45 +728,43 @@ namespace Grid {
int nbr_ox = (nbr_lcoor%rd); // outer coord of peer int nbr_ox = (nbr_lcoor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit)); int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit; if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox); assert (sx == nbr_ox);
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
void *vrp = (void *)rp;
void *vsp = (void *)sp;
if(nbr_proc){ if(nbr_proc){
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
commstime-=usecond(); commstime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
commstime+=usecond(); commstime+=usecond();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = rp;
} else { } else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
rpointers[i] = sp;
} }
} }
// std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size <<" unified buffer size "<<_unified_buffer_size<<std::endl; AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size);
mergetime-=usecond();
PARALLEL_FOR_LOOP u_comm_offset +=buffer_size;
for(int i=0;i<buffer_size;i++){
// std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
// assert(u_comm_offset+i<_unified_buffer_size);
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
}
mergetime+=usecond();
u_comm_offset+=buffer_size;
} }
} }
} }
}; };
} }
#endif #endif

View File

@ -204,7 +204,6 @@ namespace Grid {
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;
std::vector<siteVector,alignedAllocator<siteVector> > comm_buf;
/////////////////////// ///////////////////////
// Interface // Interface
@ -217,7 +216,7 @@ namespace Grid {
conformable(in._grid,out._grid); conformable(in._grid,out._grid);
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,comm_buf,compressor); Stencil.HaloExchange(in,compressor);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<Grid()->oSites();ss++){ for(int ss=0;ss<Grid()->oSites();ss++){
@ -234,7 +233,7 @@ PARALLEL_FOR_LOOP
} else if(SE->_is_local) { } else if(SE->_is_local) {
nbr = in._odata[SE->_offset]; nbr = in._odata[SE->_offset];
} else { } else {
nbr = comm_buf[SE->_offset]; nbr = Stencil.comm_buf[SE->_offset];
} }
res = res + A[point]._odata[ss]*nbr; res = res + A[point]._odata[ss]*nbr;
} }
@ -258,7 +257,6 @@ PARALLEL_FOR_LOOP
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid) A(geom.npoint,&CoarseGrid)
{ {
comm_buf.resize(Stencil._unified_buffer_size);
}; };
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

View File

@ -44,7 +44,7 @@ public:
// Gather for when there is no need to SIMD split with compression // Gather for when there is no need to SIMD split with compression
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> void template<class vobj,class cobj,class compressor> void
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress) Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension]; int bo = n*rhs._grid->_slice_block[dimension];
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
} }
} }
} else { } else {
@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
} }
} }
} }

View File

@ -58,7 +58,6 @@ namespace QCD {
UmuOdd (&Hgrid) UmuOdd (&Hgrid)
{ {
// Allocate the required comms buffer // Allocate the required comms buffer
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu); ImportGauge(_Umu);
} }
@ -153,7 +152,7 @@ namespace QCD {
FermionField Atilde(B._grid); FermionField Atilde(B._grid);
Atilde = A; Atilde = A;
st.HaloExchange(B,comm_buf,compressor); st.HaloExchange(B,compressor);
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
@ -168,7 +167,7 @@ namespace QCD {
//////////////////////// ////////////////////////
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<B._grid->oSites();sss++){ for(int sss=0;sss<B._grid->oSites();sss++){
Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma); Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
} }
////////////////////////////////////////////////// //////////////////////////////////////////////////
@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag); Compressor compressor(dag);
Stencil.HaloExchange(in,comm_buf,compressor); Stencil.HaloExchange(in,compressor);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma); Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
} }
}; };
@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
assert((dag==DaggerNo) ||(dag==DaggerYes)); assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
st.HaloExchange(in,comm_buf,compressor); st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) { if ( dag == DaggerYes ) {
if( HandOptDslash ) { if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out); Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out); Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
} }
} }
} else { } else {
if( HandOptDslash ) { if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out); Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
} }
} }
} }
@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag); Compressor compressor(dag);
std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor); auto handle = st.HaloExchangeBegin(in,compressor);
comms_thread.join();
bool local = true; bool local = true;
bool nonlocal = false; bool nonlocal = false;
@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
if( HandOptDslash ) { if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} }
} else { } else {
if( HandOptDslash ) { if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} }
} }
st.HaloExchangeComplete(handle);
local = false; local = false;
nonlocal = true; nonlocal = true;
@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
if( HandOptDslash ) { if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} }
} else { } else {
if( HandOptDslash ) { if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){ for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
} }
} }
} }

View File

@ -153,9 +153,6 @@ namespace Grid {
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
}; };
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;

View File

@ -98,12 +98,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
} }
// Allocate the required comms buffer // Allocate the required comms buffer
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu); ImportGauge(_Umu);
commtime=0; commtime=0;
jointime=0; jointime=0;
dslashtime=0; dslashtime=0;
dslash1time=0;
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -121,7 +120,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t; // assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo); Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,comm_buf,compressor); Stencil.HaloExchange(in,compressor);
int skip = (disp==1) ? 0 : 1; int skip = (disp==1) ? 0 : 1;
@ -136,7 +135,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma); Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
} }
} }
}; };
@ -159,7 +158,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
FermionField Btilde(B._grid); FermionField Btilde(B._grid);
FermionField Atilde(B._grid); FermionField Atilde(B._grid);
st.HaloExchange(B,comm_buf,compressor); st.HaloExchange(B,compressor);
Atilde=A; Atilde=A;
@ -184,7 +183,7 @@ PARALLEL_FOR_LOOP
assert ( sF< B._grid->oSites()); assert ( sF< B._grid->oSites());
assert ( sU< U._grid->oSites()); assert ( sU< U._grid->oSites());
Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma); Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
//////////////////////////// ////////////////////////////
// spin trace outer product // spin trace outer product
@ -238,6 +237,7 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout<<GridLogMessage << "********************"<<std::endl; std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl; std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl; std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl; std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl; std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl; std::cout<<GridLogMessage << "********************"<<std::endl;
@ -299,11 +299,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
int nwork = U._grid->oSites(); int nwork = U._grid->oSites();
commtime -=usecond(); commtime -=usecond();
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor); auto handle = st.HaloExchangeBegin(in,compressor);
st.HaloExchangeComplete(handle);
commtime +=usecond(); commtime +=usecond();
jointime -=usecond(); jointime -=usecond();
thr.join();
jointime +=usecond(); jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -319,7 +319,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out); Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
} }
} }
} else { } else {
@ -330,7 +330,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){ for(sd=0;sd<Ls;sd++){
int sU=ss; int sU=ss;
int sF = sd+Ls*sU; int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out); Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
} }
} }
} }
@ -362,7 +362,7 @@ PARALLEL_FOR_LOOP
sU = lo.Reorder(sU); sU = lo.Reorder(sU);
} }
sF = s+Ls*sU; sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0] Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
} }
} }
} }
@ -387,7 +387,7 @@ PARALLEL_FOR_LOOP
sU=ss+ ssoff; sU=ss+ ssoff;
for(int s=soff;s<soff+swork;s++){ for(int s=soff;s<soff+swork;s++){
sF = s+Ls*sU; sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
} }
} }
} }
@ -398,7 +398,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
} }
} }
} else { } else {
@ -407,7 +407,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out); Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
} }
} }
} }
@ -432,7 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
int nwork = U._grid->oSites(); int nwork = U._grid->oSites();
commtime -=usecond(); commtime -=usecond();
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor); auto handle = st.HaloExchangeBegin(in,compressor);
commtime +=usecond(); commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -450,7 +450,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} else { } else {
@ -461,7 +461,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){ for(sd=0;sd<Ls;sd++){
int sU=ss; int sU=ss;
int sF = sd+Ls*sU; int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} }
@ -473,7 +473,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} else { } else {
@ -482,7 +482,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} }
@ -490,12 +490,12 @@ PARALLEL_FOR_LOOP
dslashtime +=usecond(); dslashtime +=usecond();
jointime -=usecond(); jointime -=usecond();
thr.join(); st.HaloExchangeComplete(handle);
jointime +=usecond(); jointime +=usecond();
local = false; local = false;
nonlocal = true; nonlocal = true;
dslashtime -=usecond(); dslash1time -=usecond();
if ( dag == DaggerYes ) { if ( dag == DaggerYes ) {
if( this->HandOptDslash ) { if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
@ -503,7 +503,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} else { } else {
@ -514,7 +514,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){ for(sd=0;sd<Ls;sd++){
int sU=ss; int sU=ss;
int sF = sd+Ls*sU; int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} }
@ -526,7 +526,7 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} else { } else {
@ -535,13 +535,12 @@ PARALLEL_FOR_LOOP
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal); Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
} }
} }
} }
} }
dslashtime +=usecond(); dslash1time +=usecond();
} }

View File

@ -64,6 +64,7 @@ namespace Grid {
double jointime; double jointime;
double commtime; double commtime;
double dslashtime; double dslashtime;
double dslash1time;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -99,9 +99,8 @@ int main (int argc, char ** argv)
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir]; ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
} }
std::vector<vobj,alignedAllocator<vobj> > comm_buf(myStencil._unified_buffer_size);
SimpleCompressor<vobj> compress; SimpleCompressor<vobj> compress;
myStencil.HaloExchange(Foo,comm_buf,compress); myStencil.HaloExchange(Foo,compress);
Bar = Cshift(Foo,dir,disp); Bar = Cshift(Foo,dir,disp);
@ -117,7 +116,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local) else if (SE->_is_local)
Check._odata[i] = Foo._odata[SE->_offset]; Check._odata[i] = Foo._odata[SE->_offset];
else else
Check._odata[i] = comm_buf[SE->_offset]; Check._odata[i] = myStencil.comm_buf[SE->_offset];
} }
Real nrmC = norm2(Check); Real nrmC = norm2(Check);
@ -181,13 +180,10 @@ int main (int argc, char ** argv)
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir]; ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
} }
std::vector<vobj,alignedAllocator<vobj> > Ecomm_buf(EStencil._unified_buffer_size);
std::vector<vobj,alignedAllocator<vobj> > Ocomm_buf(OStencil._unified_buffer_size);
SimpleCompressor<vobj> compress; SimpleCompressor<vobj> compress;
EStencil.HaloExchange(EFoo,Ecomm_buf,compress); EStencil.HaloExchange(EFoo,compress);
OStencil.HaloExchange(OFoo,Ocomm_buf,compress); OStencil.HaloExchange(OFoo,compress);
Bar = Cshift(Foo,dir,disp); Bar = Cshift(Foo,dir,disp);
@ -211,7 +207,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local) else if (SE->_is_local)
OCheck._odata[i] = EFoo._odata[SE->_offset]; OCheck._odata[i] = EFoo._odata[SE->_offset];
else else
OCheck._odata[i] = Ecomm_buf[SE->_offset]; OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
} }
for(int i=0;i<ECheck._grid->oSites();i++){ for(int i=0;i<ECheck._grid->oSites();i++){
int permute_type; int permute_type;
@ -224,7 +220,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local) else if (SE->_is_local)
ECheck._odata[i] = OFoo._odata[SE->_offset]; ECheck._odata[i] = OFoo._odata[SE->_offset];
else else
ECheck._odata[i] = Ocomm_buf[SE->_offset]; ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
} }
setCheckerboard(Check,ECheck); setCheckerboard(Check,ECheck);