mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
GparityWilsonTM typedef added. Not yet tested
Conflicts: configure lib/qcd/action/fermion/WilsonKernels.h
This commit is contained in:
commit
411ac49dd7
@ -116,7 +116,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
typename DomainWallFermionR::ImplParams params;
|
typename DomainWallFermionR::ImplParams params;
|
||||||
params.overlapCommsCompute = overlapComms;
|
params.overlapCommsCompute = overlapComms;
|
||||||
|
|
||||||
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
|
||||||
|
|
||||||
@ -136,6 +137,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
err = ref-result;
|
err = ref-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
Dw.Report();
|
Dw.Report();
|
||||||
@ -193,6 +195,7 @@ int main (int argc, char ** argv)
|
|||||||
double flops=(1344.0*volume*ncall)/2;
|
double flops=(1344.0*volume*ncall)/2;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
@ -137,9 +137,6 @@
|
|||||||
/* Define to the one symbol short name of this package. */
|
/* Define to the one symbol short name of this package. */
|
||||||
#undef PACKAGE_TARNAME
|
#undef PACKAGE_TARNAME
|
||||||
|
|
||||||
/* Define to the home page for this package. */
|
|
||||||
#undef PACKAGE_URL
|
|
||||||
|
|
||||||
/* Define to the version of this package. */
|
/* Define to the version of this package. */
|
||||||
#undef PACKAGE_VERSION
|
#undef PACKAGE_VERSION
|
||||||
|
|
||||||
|
324
lib/Stencil.h
324
lib/Stencil.h
@ -7,8 +7,6 @@
|
|||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -88,11 +86,95 @@ namespace Grid {
|
|||||||
typedef typename cobj::scalar_type scalar_type;
|
typedef typename cobj::scalar_type scalar_type;
|
||||||
typedef typename cobj::scalar_object scalar_object;
|
typedef typename cobj::scalar_object scalar_object;
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// Comms packet queue for asynch thread
|
||||||
|
//////////////////////////////////////////
|
||||||
|
|
||||||
|
struct Packet {
|
||||||
|
void * send_buf;
|
||||||
|
void * recv_buf;
|
||||||
|
Integer to_rank;
|
||||||
|
Integer from_rank;
|
||||||
|
Integer bytes;
|
||||||
|
volatile Integer done;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<Packet> Packets;
|
||||||
|
|
||||||
|
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
|
||||||
|
Packet p;
|
||||||
|
p.send_buf = xmit;
|
||||||
|
p.recv_buf = rcv;
|
||||||
|
p.to_rank = to;
|
||||||
|
p.from_rank= from;
|
||||||
|
p.bytes = bytes;
|
||||||
|
p.done = 0;
|
||||||
|
comms_bytes+=2.0*bytes;
|
||||||
|
Packets.push_back(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Communicate(void ) {
|
||||||
|
commtime-=usecond();
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
_grid->SendToRecvFrom(Packets[i].send_buf,
|
||||||
|
Packets[i].to_rank,
|
||||||
|
Packets[i].recv_buf,
|
||||||
|
Packets[i].from_rank,
|
||||||
|
Packets[i].bytes);
|
||||||
|
Packets[i].done = 1;
|
||||||
|
}
|
||||||
|
commtime+=usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Simd merge queue for asynch comms
|
||||||
|
///////////////////////////////////////////
|
||||||
|
struct Merge {
|
||||||
|
cobj * mpointer;
|
||||||
|
std::vector<scalar_object *> rpointers;
|
||||||
|
Integer buffer_size;
|
||||||
|
Integer packet_id;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<Merge> Mergers;
|
||||||
|
|
||||||
|
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
|
||||||
|
Merge m;
|
||||||
|
m.mpointer = merge_p;
|
||||||
|
m.rpointers= rpointers;
|
||||||
|
m.buffer_size = buffer_size;
|
||||||
|
m.packet_id = packet_id;
|
||||||
|
Mergers.push_back(m);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CommsMerge(void ) {
|
||||||
|
//PARALLEL_NESTED_LOOP2
|
||||||
|
for(int i=0;i<Mergers.size();i++){
|
||||||
|
|
||||||
|
|
||||||
|
spintime-=usecond();
|
||||||
|
int packet_id = Mergers[i].packet_id;
|
||||||
|
while(! Packets[packet_id].done ); // spin for completion
|
||||||
|
spintime+=usecond();
|
||||||
|
|
||||||
|
mergetime-=usecond();
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
|
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||||
|
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||||
|
}
|
||||||
|
mergetime+=usecond();
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Basic Grid and stencil info
|
||||||
|
////////////////////////////////////////
|
||||||
|
|
||||||
int _checkerboard;
|
int _checkerboard;
|
||||||
int _npoints; // Move to template param?
|
int _npoints; // Move to template param?
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
|
|
||||||
|
|
||||||
// npoints of these
|
// npoints of these
|
||||||
std::vector<int> _directions;
|
std::vector<int> _directions;
|
||||||
std::vector<int> _distances;
|
std::vector<int> _distances;
|
||||||
@ -101,32 +183,32 @@ namespace Grid {
|
|||||||
|
|
||||||
// npoints x Osites() of these
|
// npoints x Osites() of these
|
||||||
std::vector<std::vector<StencilEntry> > _entries;
|
std::vector<std::vector<StencilEntry> > _entries;
|
||||||
|
|
||||||
// Comms buffers
|
|
||||||
std::vector<std::vector<scalar_object> > send_buf_extract;
|
|
||||||
std::vector<std::vector<scalar_object> > recv_buf_extract;
|
|
||||||
std::vector<scalar_object *> pointers;
|
|
||||||
std::vector<scalar_object *> rpointers;
|
|
||||||
Vector<cobj> send_buf;
|
|
||||||
|
|
||||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
||||||
|
|
||||||
|
// Comms buffers
|
||||||
|
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
||||||
|
std::vector<Vector<scalar_object> > u_simd_recv_buf;
|
||||||
|
Vector<cobj> u_send_buf;
|
||||||
|
Vector<cobj> comm_buf;
|
||||||
|
int u_comm_offset;
|
||||||
int _unified_buffer_size;
|
int _unified_buffer_size;
|
||||||
int _request_count;
|
|
||||||
|
|
||||||
double buftime;
|
/////////////////////////////////////////
|
||||||
|
// Timing info; ugly; possibly temporary
|
||||||
|
/////////////////////////////////////////
|
||||||
|
#define TIMING_HACK
|
||||||
|
#ifdef TIMING_HACK
|
||||||
|
double jointime;
|
||||||
double gathertime;
|
double gathertime;
|
||||||
double commtime;
|
double commtime;
|
||||||
double commstime;
|
double halogtime;
|
||||||
double halotime;
|
|
||||||
double scattertime;
|
|
||||||
double mergetime;
|
double mergetime;
|
||||||
|
double spintime;
|
||||||
|
double comms_bytes;
|
||||||
double gathermtime;
|
double gathermtime;
|
||||||
double splicetime;
|
double splicetime;
|
||||||
double nosplicetime;
|
double nosplicetime;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
CartesianStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
@ -135,28 +217,29 @@ namespace Grid {
|
|||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
||||||
{
|
{
|
||||||
|
#ifdef TIMING_HACK
|
||||||
gathertime=0;
|
gathertime=0;
|
||||||
|
jointime=0;
|
||||||
commtime=0;
|
commtime=0;
|
||||||
commstime=0;
|
halogtime=0;
|
||||||
halotime=0;
|
|
||||||
scattertime=0;
|
|
||||||
mergetime=0;
|
mergetime=0;
|
||||||
|
spintime=0;
|
||||||
gathermtime=0;
|
gathermtime=0;
|
||||||
buftime=0;
|
|
||||||
splicetime=0;
|
splicetime=0;
|
||||||
nosplicetime=0;
|
nosplicetime=0;
|
||||||
|
comms_bytes=0;
|
||||||
|
#endif
|
||||||
_npoints = npoints;
|
_npoints = npoints;
|
||||||
_grid = grid;
|
_grid = grid;
|
||||||
_directions = directions;
|
_directions = directions;
|
||||||
_distances = distances;
|
_distances = distances;
|
||||||
_unified_buffer_size=0;
|
_unified_buffer_size=0;
|
||||||
_request_count =0;
|
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
for(int i=0;i<npoints;i++){
|
for(int ii=0;ii<npoints;ii++){
|
||||||
|
|
||||||
|
int i = ii; // reverse direction to get SIMD comms done first
|
||||||
int point = i;
|
int point = i;
|
||||||
|
|
||||||
_entries[i].resize( osites);
|
_entries[i].resize( osites);
|
||||||
@ -197,22 +280,25 @@ namespace Grid {
|
|||||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||||
|
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
// std::cout<<"Comms 0x3"<<std::endl;
|
|
||||||
Comms(point,dimension,shift,0x3);
|
Comms(point,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
|
||||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// for(int ss=0;ss<osites;ss++){
|
}
|
||||||
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
u_send_buf.resize(_unified_buffer_size);
|
||||||
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
comm_buf.resize(_unified_buffer_size);
|
||||||
// }
|
|
||||||
|
const int Nsimd = grid->Nsimd();
|
||||||
|
u_simd_send_buf.resize(Nsimd);
|
||||||
|
u_simd_recv_buf.resize(Nsimd);
|
||||||
|
for(int l=0;l<Nsimd;l++){
|
||||||
|
u_simd_send_buf[l].resize(_unified_buffer_size);
|
||||||
|
u_simd_recv_buf[l].resize(_unified_buffer_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void Local (int point, int dimension,int shiftpm,int cbmask)
|
void Local (int point, int dimension,int shiftpm,int cbmask)
|
||||||
{
|
{
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
@ -276,17 +362,15 @@ namespace Grid {
|
|||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
|
||||||
// std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;
|
|
||||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||||
// send to one or more remote nodes.
|
// send to one or more remote nodes.
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
|
||||||
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
|
||||||
int permute_type=grid->PermuteType(dimension);
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
@ -310,16 +394,9 @@ namespace Grid {
|
|||||||
} else {
|
} else {
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
offnode = (comm_proc!= 0);
|
offnode = (comm_proc!= 0);
|
||||||
// std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
|
|
||||||
// x+sshift = 4
|
|
||||||
// x+sshift/2 = 2
|
|
||||||
// 2%2 == 0
|
|
||||||
// Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.
|
|
||||||
|
|
||||||
int wraparound=0;
|
int wraparound=0;
|
||||||
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
||||||
wraparound = 1;
|
wraparound = 1;
|
||||||
@ -337,15 +414,13 @@ namespace Grid {
|
|||||||
int words = buffer_size;
|
int words = buffer_size;
|
||||||
if (cbmask != 0x3) words=words>>1;
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
// GatherPlaneSimple (point,dimension,sx,cbmask);
|
|
||||||
|
|
||||||
int rank = grid->_processor;
|
int rank = grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
|
|
||||||
int unified_buffer_offset = _unified_buffer_size;
|
int unified_buffer_offset = _unified_buffer_size;
|
||||||
_unified_buffer_size += words;
|
_unified_buffer_size += words;
|
||||||
// std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;
|
|
||||||
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -441,39 +516,36 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// CartesianStencil(GridBase *grid,
|
|
||||||
// int npoints,
|
|
||||||
// int checkerboard,
|
|
||||||
// const std::vector<int> &directions,
|
|
||||||
// const std::vector<int> &distances);
|
|
||||||
|
|
||||||
|
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
||||||
|
Mergers.resize(0);
|
||||||
|
Packets.resize(0);
|
||||||
|
HaloGather(source,compress);
|
||||||
|
return std::thread([&] { this->Communicate(); });
|
||||||
|
}
|
||||||
|
|
||||||
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
|
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
// Can this be avoided with simpler coding of comms?
|
|
||||||
// void Local (int point, int dimension,int shift,int cbmask);
|
|
||||||
// void Comms (int point, int dimension,int shift,int cbmask);
|
|
||||||
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
|
|
||||||
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
|
|
||||||
|
|
||||||
// Could allow a functional munging of the halo to another type during the comms.
|
|
||||||
// this could implement the 16bit/32bit/64bit compression.
|
|
||||||
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
|
||||||
{
|
{
|
||||||
std::thread thr = HaloExchangeBegin(source,u_comm_buf,compress);
|
auto thr = HaloExchangeBegin(source,compress);
|
||||||
|
HaloExchangeComplete(thr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void HaloExchangeComplete(std::thread &thr)
|
||||||
|
{
|
||||||
|
CommsMerge(); // spins
|
||||||
|
jointime-=usecond();
|
||||||
thr.join();
|
thr.join();
|
||||||
|
jointime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::thread HaloExchangeBegin(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > & u_comm_buf,compressor &compress) {
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
return std::thread([&] { this->HaloExchangeBlocking(source,u_comm_buf,compress); });
|
|
||||||
}
|
|
||||||
|
|
||||||
void HaloExchangeBlocking(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
|
||||||
{
|
{
|
||||||
// conformable(source._grid,_grid);
|
// conformable(source._grid,_grid);
|
||||||
assert(source._grid==_grid);
|
assert(source._grid==_grid);
|
||||||
halotime-=usecond();
|
halogtime-=usecond();
|
||||||
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
|
||||||
int u_comm_offset=0;
|
assert (comm_buf.size() == _unified_buffer_size );
|
||||||
|
u_comm_offset=0;
|
||||||
|
|
||||||
// Gather all comms buffers
|
// Gather all comms buffers
|
||||||
for(int point = 0 ; point < _npoints; point++) {
|
for(int point = 0 ; point < _npoints; point++) {
|
||||||
@ -506,35 +578,34 @@ namespace Grid {
|
|||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
if (splice_dim) {
|
if (splice_dim) {
|
||||||
splicetime-=usecond();
|
splicetime-=usecond();
|
||||||
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
GatherSimd(source,dimension,shift,0x3,compress);
|
||||||
splicetime+=usecond();
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
nosplicetime-=usecond();
|
nosplicetime-=usecond();
|
||||||
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
|
Gather(source,dimension,shift,0x3,compress);
|
||||||
nosplicetime+=usecond();
|
nosplicetime+=usecond();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
|
||||||
if(splice_dim){
|
if(splice_dim){
|
||||||
splicetime-=usecond();
|
splicetime-=usecond();
|
||||||
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
|
GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes
|
||||||
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
|
GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration
|
||||||
splicetime+=usecond();
|
splicetime+=usecond();
|
||||||
} else {
|
} else {
|
||||||
nosplicetime-=usecond();
|
nosplicetime-=usecond();
|
||||||
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
|
Gather(source,dimension,shift,0x1,compress);
|
||||||
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
|
Gather(source,dimension,shift,0x2,compress);
|
||||||
nosplicetime+=usecond();
|
nosplicetime+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
halotime+=usecond();
|
|
||||||
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
|
halogtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
|
||||||
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
|
||||||
int &u_comm_offset,compressor & compress)
|
|
||||||
{
|
{
|
||||||
typedef typename cobj::vector_type vector_type;
|
typedef typename cobj::vector_type vector_type;
|
||||||
typedef typename cobj::scalar_type scalar_type;
|
typedef typename cobj::scalar_type scalar_type;
|
||||||
@ -555,8 +626,6 @@ namespace Grid {
|
|||||||
|
|
||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||||
|
|
||||||
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
||||||
|
|
||||||
@ -573,7 +642,7 @@ namespace Grid {
|
|||||||
int bytes = words * sizeof(cobj);
|
int bytes = words * sizeof(cobj);
|
||||||
|
|
||||||
gathertime-=usecond();
|
gathertime-=usecond();
|
||||||
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
|
Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset);
|
||||||
gathertime+=usecond();
|
gathertime+=usecond();
|
||||||
|
|
||||||
int rank = _grid->_processor;
|
int rank = _grid->_processor;
|
||||||
@ -584,27 +653,28 @@ namespace Grid {
|
|||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||||
commtime-=usecond();
|
/*
|
||||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&u_comm_buf[u_comm_offset],
|
(void *)&comm_buf[u_comm_offset],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
commtime+=usecond();
|
*/
|
||||||
|
AddPacket((void *)&u_send_buf[u_comm_offset],
|
||||||
|
(void *)&comm_buf[u_comm_offset],
|
||||||
|
xmit_to_rank,
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
|
||||||
u_comm_offset+=words;
|
u_comm_offset+=words;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
|
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
|
||||||
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
|
|
||||||
int &u_comm_offset,compressor &compress)
|
|
||||||
{
|
{
|
||||||
buftime-=usecond();
|
|
||||||
const int Nsimd = _grid->Nsimd();
|
const int Nsimd = _grid->Nsimd();
|
||||||
|
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -628,22 +698,11 @@ namespace Grid {
|
|||||||
|
|
||||||
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
||||||
|
|
||||||
// Should grow to max size and then cost very little thereafter
|
|
||||||
send_buf_extract.resize(Nsimd);
|
|
||||||
recv_buf_extract.resize(Nsimd);
|
|
||||||
for(int l=0;l<Nsimd;l++){
|
|
||||||
if( send_buf_extract[l].size() < buffer_size) {
|
|
||||||
send_buf_extract[l].resize(buffer_size);
|
|
||||||
recv_buf_extract[l].resize(buffer_size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pointers.resize(Nsimd);
|
|
||||||
rpointers.resize(Nsimd);
|
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
buftime+=usecond();
|
std::vector<scalar_object *> rpointers(Nsimd);
|
||||||
|
std::vector<scalar_object *> spointers(Nsimd);
|
||||||
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Work out what to send where
|
// Work out what to send where
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
@ -659,16 +718,19 @@ namespace Grid {
|
|||||||
if ( any_offnode ) {
|
if ( any_offnode ) {
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
pointers[i] = &send_buf_extract[i][0];
|
spointers[i] = &u_simd_send_buf[i][u_comm_offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
gathermtime-=usecond();
|
gathermtime-=usecond();
|
||||||
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
|
Gather_plane_extract<cobj>(rhs,spointers,dimension,sx,cbmask,compress);
|
||||||
gathermtime+=usecond();
|
gathermtime+=usecond();
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
|
||||||
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
int ic= (i&inner_bit)? 1:0;
|
int ic= (i&inner_bit)? 1:0;
|
||||||
|
|
||||||
@ -680,45 +742,41 @@ namespace Grid {
|
|||||||
int nbr_ox = (nbr_lcoor%rd); // outer coord of peer
|
int nbr_ox = (nbr_lcoor%rd); // outer coord of peer
|
||||||
int nbr_lane = (i&(~inner_bit));
|
int nbr_lane = (i&(~inner_bit));
|
||||||
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
|
|
||||||
if (nbr_ic) nbr_lane|=inner_bit;
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
assert (sx == nbr_ox);
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
|
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
||||||
|
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
|
||||||
|
|
||||||
|
void *vrp = (void *)rp;
|
||||||
|
void *vsp = (void *)sp;
|
||||||
|
|
||||||
|
|
||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
|
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
commstime-=usecond();
|
AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
|
||||||
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&recv_buf_extract[i][0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
commstime+=usecond();
|
|
||||||
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = rp;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
|
||||||
|
rpointers[i] = sp;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size <<" unified buffer size "<<_unified_buffer_size<<std::endl;
|
AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
|
||||||
mergetime-=usecond();
|
|
||||||
PARALLEL_FOR_LOOP
|
u_comm_offset +=buffer_size;
|
||||||
for(int i=0;i<buffer_size;i++){
|
|
||||||
// std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
|
|
||||||
// assert(u_comm_offset+i<_unified_buffer_size);
|
|
||||||
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
|
||||||
}
|
|
||||||
mergetime+=usecond();
|
|
||||||
u_comm_offset+=buffer_size;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -204,7 +204,6 @@ namespace Grid {
|
|||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
std::vector<siteVector,alignedAllocator<siteVector> > comm_buf;
|
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@ -217,7 +216,7 @@ namespace Grid {
|
|||||||
conformable(in._grid,out._grid);
|
conformable(in._grid,out._grid);
|
||||||
|
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<Grid()->oSites();ss++){
|
for(int ss=0;ss<Grid()->oSites();ss++){
|
||||||
@ -234,7 +233,7 @@ PARALLEL_FOR_LOOP
|
|||||||
} else if(SE->_is_local) {
|
} else if(SE->_is_local) {
|
||||||
nbr = in._odata[SE->_offset];
|
nbr = in._odata[SE->_offset];
|
||||||
} else {
|
} else {
|
||||||
nbr = comm_buf[SE->_offset];
|
nbr = Stencil.comm_buf[SE->_offset];
|
||||||
}
|
}
|
||||||
res = res + A[point]._odata[ss]*nbr;
|
res = res + A[point]._odata[ss]*nbr;
|
||||||
}
|
}
|
||||||
@ -258,7 +257,6 @@ PARALLEL_FOR_LOOP
|
|||||||
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
|
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
|
||||||
A(geom.npoint,&CoarseGrid)
|
A(geom.npoint,&CoarseGrid)
|
||||||
{
|
{
|
||||||
comm_buf.resize(Stencil._unified_buffer_size);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
|
@ -44,7 +44,7 @@ public:
|
|||||||
// Gather for when there is no need to SIMD split with compression
|
// Gather for when there is no need to SIMD split with compression
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<class vobj,class cobj,class compressor> void
|
template<class vobj,class cobj,class compressor> void
|
||||||
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress)
|
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
|
||||||
{
|
{
|
||||||
int rd = rhs._grid->_rdimensions[dimension];
|
int rd = rhs._grid->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
int bo = n*rhs._grid->_slice_block[dimension];
|
int bo = n*rhs._grid->_slice_block[dimension];
|
||||||
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*rhs._grid->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -206,6 +206,9 @@ typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonP
|
|||||||
typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
|
typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
|
||||||
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
|
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
|
||||||
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
|
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
|
||||||
|
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
|
||||||
|
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
|
||||||
|
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
|
||||||
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
|
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
|
||||||
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
|
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
|
||||||
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
|
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
|
||||||
|
@ -58,7 +58,6 @@ namespace QCD {
|
|||||||
UmuOdd (&Hgrid)
|
UmuOdd (&Hgrid)
|
||||||
{
|
{
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,7 +152,7 @@ namespace QCD {
|
|||||||
FermionField Atilde(B._grid);
|
FermionField Atilde(B._grid);
|
||||||
Atilde = A;
|
Atilde = A;
|
||||||
|
|
||||||
st.HaloExchange(B,comm_buf,compressor);
|
st.HaloExchange(B,compressor);
|
||||||
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
@ -168,7 +167,7 @@ namespace QCD {
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<B._grid->oSites();sss++){
|
for(int sss=0;sss<B._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
|
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma);
|
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
|
|||||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
st.HaloExchange(in,comm_buf,compressor);
|
st.HaloExchange(in,compressor);
|
||||||
|
|
||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor);
|
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||||
comms_thread.join();
|
|
||||||
|
|
||||||
bool local = true;
|
bool local = true;
|
||||||
bool nonlocal = false;
|
bool nonlocal = false;
|
||||||
@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
|
|||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
st.HaloExchangeComplete(handle);
|
||||||
|
|
||||||
local = false;
|
local = false;
|
||||||
nonlocal = true;
|
nonlocal = true;
|
||||||
@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
|
|||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if( HandOptDslash ) {
|
if( HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -152,9 +152,6 @@ namespace Grid {
|
|||||||
DoubledGaugeField Umu;
|
DoubledGaugeField Umu;
|
||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
// Comms buffer
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -98,12 +98,12 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
|
||||||
|
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
|
alltime=0;
|
||||||
commtime=0;
|
commtime=0;
|
||||||
jointime=0;
|
jointime=0;
|
||||||
dslashtime=0;
|
dslashtime=0;
|
||||||
|
dslash1time=0;
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||||
@ -121,7 +121,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
|||||||
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
||||||
|
|
||||||
Compressor compressor(DaggerNo);
|
Compressor compressor(DaggerNo);
|
||||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
|
||||||
int skip = (disp==1) ? 0 : 1;
|
int skip = (disp==1) ? 0 : 1;
|
||||||
|
|
||||||
@ -136,7 +136,7 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma);
|
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -159,7 +159,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
|||||||
FermionField Btilde(B._grid);
|
FermionField Btilde(B._grid);
|
||||||
FermionField Atilde(B._grid);
|
FermionField Atilde(B._grid);
|
||||||
|
|
||||||
st.HaloExchange(B,comm_buf,compressor);
|
st.HaloExchange(B,compressor);
|
||||||
|
|
||||||
Atilde=A;
|
Atilde=A;
|
||||||
|
|
||||||
@ -184,7 +184,7 @@ PARALLEL_FOR_LOOP
|
|||||||
assert ( sF< B._grid->oSites());
|
assert ( sF< B._grid->oSites());
|
||||||
assert ( sU< U._grid->oSites());
|
assert ( sU< U._grid->oSites());
|
||||||
|
|
||||||
Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
|
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
|
||||||
|
|
||||||
////////////////////////////
|
////////////////////////////
|
||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
@ -235,22 +235,25 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::Report(void)
|
void WilsonFermion5D<Impl>::Report(void)
|
||||||
{
|
{
|
||||||
|
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -288,7 +291,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
|||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
alltime-=usecond();
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||||
@ -299,11 +302,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
|||||||
int nwork = U._grid->oSites();
|
int nwork = U._grid->oSites();
|
||||||
|
|
||||||
commtime -=usecond();
|
commtime -=usecond();
|
||||||
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
|
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||||
|
st.HaloExchangeComplete(handle);
|
||||||
commtime +=usecond();
|
commtime +=usecond();
|
||||||
|
|
||||||
jointime -=usecond();
|
jointime -=usecond();
|
||||||
thr.join();
|
|
||||||
jointime +=usecond();
|
jointime +=usecond();
|
||||||
|
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
@ -319,7 +322,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -330,7 +333,7 @@ PARALLEL_FOR_LOOP
|
|||||||
for(sd=0;sd<Ls;sd++){
|
for(sd=0;sd<Ls;sd++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = sd+Ls*sU;
|
int sF = sd+Ls*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -362,7 +365,7 @@ PARALLEL_FOR_LOOP
|
|||||||
sU = lo.Reorder(sU);
|
sU = lo.Reorder(sU);
|
||||||
}
|
}
|
||||||
sF = s+Ls*sU;
|
sF = s+Ls*sU;
|
||||||
Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
|
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -387,7 +390,7 @@ PARALLEL_FOR_LOOP
|
|||||||
sU=ss+ ssoff;
|
sU=ss+ ssoff;
|
||||||
for(int s=soff;s<soff+swork;s++){
|
for(int s=soff;s<soff+swork;s++){
|
||||||
sF = s+Ls*sU;
|
sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -398,7 +401,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -407,12 +410,13 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dslashtime +=usecond();
|
dslashtime +=usecond();
|
||||||
|
alltime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -421,7 +425,10 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
|
|||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
alltime-=usecond();
|
||||||
|
|
||||||
|
int calls;
|
||||||
|
int updates;
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||||
@ -432,7 +439,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
|
|||||||
int nwork = U._grid->oSites();
|
int nwork = U._grid->oSites();
|
||||||
|
|
||||||
commtime -=usecond();
|
commtime -=usecond();
|
||||||
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
|
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||||
commtime +=usecond();
|
commtime +=usecond();
|
||||||
|
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
@ -450,7 +457,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -461,7 +468,7 @@ PARALLEL_FOR_LOOP
|
|||||||
for(sd=0;sd<Ls;sd++){
|
for(sd=0;sd<Ls;sd++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = sd+Ls*sU;
|
int sF = sd+Ls*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -473,7 +480,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -482,7 +489,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -490,12 +497,12 @@ PARALLEL_FOR_LOOP
|
|||||||
dslashtime +=usecond();
|
dslashtime +=usecond();
|
||||||
|
|
||||||
jointime -=usecond();
|
jointime -=usecond();
|
||||||
thr.join();
|
st.HaloExchangeComplete(handle);
|
||||||
jointime +=usecond();
|
jointime +=usecond();
|
||||||
|
|
||||||
local = false;
|
local = false;
|
||||||
nonlocal = true;
|
nonlocal = true;
|
||||||
dslashtime -=usecond();
|
dslash1time -=usecond();
|
||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
if( this->HandOptDslash ) {
|
if( this->HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
@ -503,7 +510,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -514,7 +521,7 @@ PARALLEL_FOR_LOOP
|
|||||||
for(sd=0;sd<Ls;sd++){
|
for(sd=0;sd<Ls;sd++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = sd+Ls*sU;
|
int sF = sd+Ls*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -526,7 +533,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -535,13 +542,13 @@ PARALLEL_FOR_LOOP
|
|||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dslashtime +=usecond();
|
dslash1time +=usecond();
|
||||||
|
alltime+=usecond();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,9 +61,11 @@ namespace Grid {
|
|||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
|
double alltime;
|
||||||
double jointime;
|
double jointime;
|
||||||
double commtime;
|
double commtime;
|
||||||
double dslashtime;
|
double dslashtime;
|
||||||
|
double dslash1time;
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -60,15 +60,10 @@ namespace Grid {
|
|||||||
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
#if 0
|
||||||
|
<<<<<<< HEAD
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||||
// int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
|
// int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
|
||||||
#if 0
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
|
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
// doesn't seem to work with Gparity at the moment
|
// doesn't seem to work with Gparity at the moment
|
||||||
#undef HANDOPT
|
#undef HANDOPT
|
||||||
#if 1
|
#if 1
|
||||||
@ -79,7 +74,18 @@ namespace Grid {
|
|||||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||||
// int sF,int sU,const FermionField &in, FermionField &out);
|
#endif
|
||||||
|
#else
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||||
|
|
||||||
|
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||||
|
|
||||||
|
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||||
|
//>>>>>>> fc6ad657514c7966291c19f22af89de5d5a96f93
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
WilsonKernels(const ImplParams &p= ImplParams());
|
WilsonKernels(const ImplParams &p= ImplParams());
|
||||||
|
@ -310,7 +310,7 @@ namespace QCD {
|
|||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||||
{
|
{
|
||||||
@ -318,21 +318,21 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
|||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
REGISTER Simd result_00; // 12 regs on knc
|
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
||||||
REGISTER Simd result_01;
|
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
||||||
REGISTER Simd result_02;
|
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
||||||
|
|
||||||
REGISTER Simd result_10;
|
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
||||||
REGISTER Simd result_11;
|
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
||||||
REGISTER Simd result_12;
|
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
||||||
|
|
||||||
REGISTER Simd result_20;
|
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
||||||
REGISTER Simd result_21;
|
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
||||||
REGISTER Simd result_22;
|
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
||||||
|
|
||||||
REGISTER Simd result_30;
|
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
||||||
REGISTER Simd result_31;
|
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
||||||
REGISTER Simd result_32; // 20 left
|
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
||||||
|
|
||||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||||
REGISTER Simd Chi_01;
|
REGISTER Simd Chi_01;
|
||||||
@ -372,172 +372,178 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
|||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset, ptype;
|
||||||
|
int num = 0;
|
||||||
|
|
||||||
// Xp
|
// Xp
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
SE=st.GetEntry(ptype,Xp,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
XP_PROJ;
|
XP_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Xp);
|
MULT_2SPIN(Xp);
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
XP_RECON;
|
|
||||||
|
|
||||||
// Yp
|
// Yp
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
SE=st.GetEntry(ptype,Yp,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
YP_PROJ;
|
YP_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Yp);
|
MULT_2SPIN(Yp);
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
YP_RECON_ACCUM;
|
|
||||||
|
|
||||||
|
|
||||||
// Zp
|
// Zp
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
SE=st.GetEntry(ptype,Zp,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
ZP_PROJ;
|
ZP_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Zp);
|
MULT_2SPIN(Zp);
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tp
|
// Tp
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
SE=st.GetEntry(ptype,Tp,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
TP_PROJ;
|
TP_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Tp);
|
MULT_2SPIN(Tp);
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
TP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Xm
|
// Xm
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
SE=st.GetEntry(ptype,Xm,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
XM_PROJ;
|
XM_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Xm);
|
MULT_2SPIN(Xm);
|
||||||
|
XM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
XM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Ym
|
// Ym
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
SE=st.GetEntry(ptype,Ym,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
YM_PROJ;
|
YM_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Ym);
|
MULT_2SPIN(Ym);
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
YM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zm
|
// Zm
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
SE=st.GetEntry(ptype,Zm,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
ZM_PROJ;
|
ZM_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Zm);
|
MULT_2SPIN(Zm);
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tm
|
// Tm
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
SE=st.GetEntry(ptype,Tm,ss);
|
||||||
offset = SE->_offset;
|
offset = SE->_offset;
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
if (Local && SE->_is_local ) {
|
||||||
LOAD_CHIMU;
|
LOAD_CHIMU;
|
||||||
TM_PROJ;
|
TM_PROJ;
|
||||||
if ( perm) {
|
if ( SE->_permute ) {
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
LOAD_CHI;
|
LOAD_CHI;
|
||||||
}
|
}
|
||||||
{
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
MULT_2SPIN(Tm);
|
MULT_2SPIN(Tm);
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
}
|
}
|
||||||
TM_RECON_ACCUM;
|
|
||||||
|
|
||||||
{
|
SiteSpinor & ref (out._odata[ss]);
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
if ( Local ) {
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
vstream(ref()(0)(0),result_00*(-0.5));
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
vstream(ref()(0)(1),result_01*(-0.5));
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
vstream(ref()(0)(2),result_02*(-0.5));
|
||||||
@ -550,9 +556,295 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
|||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
vstream(ref()(3)(0),result_30*(-0.5));
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
vstream(ref()(3)(1),result_31*(-0.5));
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
vstream(ref()(3)(2),result_32*(-0.5));
|
||||||
|
return 1;
|
||||||
|
} else if ( num ) {
|
||||||
|
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
||||||
|
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
||||||
|
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
||||||
|
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
||||||
|
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
||||||
|
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
||||||
|
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
||||||
|
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
||||||
|
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
||||||
|
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
||||||
|
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
||||||
|
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||||
|
{
|
||||||
|
// std::cout << "Hand op Dhop "<<std::endl;
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
||||||
|
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
||||||
|
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
||||||
|
|
||||||
|
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
||||||
|
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
||||||
|
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
||||||
|
|
||||||
|
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
||||||
|
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
||||||
|
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
||||||
|
|
||||||
|
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
||||||
|
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
||||||
|
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
||||||
|
|
||||||
|
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||||
|
REGISTER Simd Chi_01;
|
||||||
|
REGISTER Simd Chi_02;
|
||||||
|
|
||||||
|
REGISTER Simd Chi_10;
|
||||||
|
REGISTER Simd Chi_11;
|
||||||
|
REGISTER Simd Chi_12; // 14 left
|
||||||
|
|
||||||
|
REGISTER Simd UChi_00; // two spinor; 6 regs
|
||||||
|
REGISTER Simd UChi_01;
|
||||||
|
REGISTER Simd UChi_02;
|
||||||
|
|
||||||
|
REGISTER Simd UChi_10;
|
||||||
|
REGISTER Simd UChi_11;
|
||||||
|
REGISTER Simd UChi_12; // 8 left
|
||||||
|
|
||||||
|
REGISTER Simd U_00; // two rows of U matrix
|
||||||
|
REGISTER Simd U_10;
|
||||||
|
REGISTER Simd U_20;
|
||||||
|
REGISTER Simd U_01;
|
||||||
|
REGISTER Simd U_11;
|
||||||
|
REGISTER Simd U_21; // 2 reg left.
|
||||||
|
|
||||||
|
#define Chimu_00 Chi_00
|
||||||
|
#define Chimu_01 Chi_01
|
||||||
|
#define Chimu_02 Chi_02
|
||||||
|
#define Chimu_10 Chi_10
|
||||||
|
#define Chimu_11 Chi_11
|
||||||
|
#define Chimu_12 Chi_12
|
||||||
|
#define Chimu_20 UChi_00
|
||||||
|
#define Chimu_21 UChi_01
|
||||||
|
#define Chimu_22 UChi_02
|
||||||
|
#define Chimu_30 UChi_10
|
||||||
|
#define Chimu_31 UChi_11
|
||||||
|
#define Chimu_32 UChi_12
|
||||||
|
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset, ptype;
|
||||||
|
int num = 0;
|
||||||
|
|
||||||
|
// Xp
|
||||||
|
SE=st.GetEntry(ptype,Xp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
XM_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Xp);
|
||||||
|
XM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Yp
|
||||||
|
SE=st.GetEntry(ptype,Yp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
YM_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Yp);
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Zp
|
||||||
|
SE=st.GetEntry(ptype,Zp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
ZM_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Zp);
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tp
|
||||||
|
SE=st.GetEntry(ptype,Tp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
TM_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Tp);
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Xm
|
||||||
|
SE=st.GetEntry(ptype,Xm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
XP_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Xm);
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ym
|
||||||
|
SE=st.GetEntry(ptype,Ym,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
YP_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Ym);
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Zm
|
||||||
|
SE=st.GetEntry(ptype,Zm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
ZP_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Zm);
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tm
|
||||||
|
SE=st.GetEntry(ptype,Tm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
|
||||||
|
if (Local && SE->_is_local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
TP_PROJ;
|
||||||
|
if ( SE->_permute ) {
|
||||||
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( Nonlocal && (!SE->_is_local) ) {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||||
|
MULT_2SPIN(Tm);
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
num++;
|
||||||
|
}
|
||||||
|
|
||||||
|
SiteSpinor & ref (out._odata[ss]);
|
||||||
|
if ( Local ) {
|
||||||
|
vstream(ref()(0)(0),result_00*(-0.5));
|
||||||
|
vstream(ref()(0)(1),result_01*(-0.5));
|
||||||
|
vstream(ref()(0)(2),result_02*(-0.5));
|
||||||
|
vstream(ref()(1)(0),result_10*(-0.5));
|
||||||
|
vstream(ref()(1)(1),result_11*(-0.5));
|
||||||
|
vstream(ref()(1)(2),result_12*(-0.5));
|
||||||
|
vstream(ref()(2)(0),result_20*(-0.5));
|
||||||
|
vstream(ref()(2)(1),result_21*(-0.5));
|
||||||
|
vstream(ref()(2)(2),result_22*(-0.5));
|
||||||
|
vstream(ref()(3)(0),result_30*(-0.5));
|
||||||
|
vstream(ref()(3)(1),result_31*(-0.5));
|
||||||
|
vstream(ref()(3)(2),result_32*(-0.5));
|
||||||
|
return 1;
|
||||||
|
} else if ( num ) {
|
||||||
|
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
||||||
|
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
||||||
|
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
||||||
|
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
||||||
|
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
||||||
|
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
||||||
|
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
||||||
|
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
||||||
|
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
||||||
|
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
||||||
|
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
||||||
|
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
@ -795,7 +1087,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
|||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
vstream(ref()(3)(2),result_32*(-0.5));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Specialise Gparity to simple implementation
|
// Specialise Gparity to simple implementation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
|
@ -217,5 +217,50 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj> inline
|
||||||
|
void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_type scalar_type ;
|
||||||
|
typedef typename vobj::vector_type vector_type ;
|
||||||
|
|
||||||
|
const int Nsimd=vobj::vector_type::Nsimd();
|
||||||
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
scalar_type *pointer;
|
||||||
|
scalar_type *vp = (scalar_type *)&vec;
|
||||||
|
|
||||||
|
// assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
pointer=(scalar_type *)&extracted[i][offset];
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
vp[w*Nsimd+i] = pointer[w];
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj> inline
|
||||||
|
void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_type scalar_type ;
|
||||||
|
typedef typename vobj::vector_type vector_type ;
|
||||||
|
|
||||||
|
const int Nsimd=vobj::vector_type::Nsimd();
|
||||||
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
scalar_type *pointer;
|
||||||
|
scalar_type *vp = (scalar_type *)&vec;
|
||||||
|
// assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
pointer=(scalar_type *)&extracted[i][offset];
|
||||||
|
vp[w*Nsimd+i] =pointer[w];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -99,9 +99,8 @@ int main (int argc, char ** argv)
|
|||||||
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
|
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<vobj,alignedAllocator<vobj> > comm_buf(myStencil._unified_buffer_size);
|
|
||||||
SimpleCompressor<vobj> compress;
|
SimpleCompressor<vobj> compress;
|
||||||
myStencil.HaloExchange(Foo,comm_buf,compress);
|
myStencil.HaloExchange(Foo,compress);
|
||||||
|
|
||||||
Bar = Cshift(Foo,dir,disp);
|
Bar = Cshift(Foo,dir,disp);
|
||||||
|
|
||||||
@ -117,7 +116,7 @@ int main (int argc, char ** argv)
|
|||||||
else if (SE->_is_local)
|
else if (SE->_is_local)
|
||||||
Check._odata[i] = Foo._odata[SE->_offset];
|
Check._odata[i] = Foo._odata[SE->_offset];
|
||||||
else
|
else
|
||||||
Check._odata[i] = comm_buf[SE->_offset];
|
Check._odata[i] = myStencil.comm_buf[SE->_offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
Real nrmC = norm2(Check);
|
Real nrmC = norm2(Check);
|
||||||
@ -181,13 +180,10 @@ int main (int argc, char ** argv)
|
|||||||
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
|
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<vobj,alignedAllocator<vobj> > Ecomm_buf(EStencil._unified_buffer_size);
|
|
||||||
std::vector<vobj,alignedAllocator<vobj> > Ocomm_buf(OStencil._unified_buffer_size);
|
|
||||||
|
|
||||||
SimpleCompressor<vobj> compress;
|
SimpleCompressor<vobj> compress;
|
||||||
|
|
||||||
EStencil.HaloExchange(EFoo,Ecomm_buf,compress);
|
EStencil.HaloExchange(EFoo,compress);
|
||||||
OStencil.HaloExchange(OFoo,Ocomm_buf,compress);
|
OStencil.HaloExchange(OFoo,compress);
|
||||||
|
|
||||||
Bar = Cshift(Foo,dir,disp);
|
Bar = Cshift(Foo,dir,disp);
|
||||||
|
|
||||||
@ -211,7 +207,7 @@ int main (int argc, char ** argv)
|
|||||||
else if (SE->_is_local)
|
else if (SE->_is_local)
|
||||||
OCheck._odata[i] = EFoo._odata[SE->_offset];
|
OCheck._odata[i] = EFoo._odata[SE->_offset];
|
||||||
else
|
else
|
||||||
OCheck._odata[i] = Ecomm_buf[SE->_offset];
|
OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
|
||||||
}
|
}
|
||||||
for(int i=0;i<ECheck._grid->oSites();i++){
|
for(int i=0;i<ECheck._grid->oSites();i++){
|
||||||
int permute_type;
|
int permute_type;
|
||||||
@ -224,7 +220,7 @@ int main (int argc, char ** argv)
|
|||||||
else if (SE->_is_local)
|
else if (SE->_is_local)
|
||||||
ECheck._odata[i] = OFoo._odata[SE->_offset];
|
ECheck._odata[i] = OFoo._odata[SE->_offset];
|
||||||
else
|
else
|
||||||
ECheck._odata[i] = Ocomm_buf[SE->_offset];
|
ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
setCheckerboard(Check,ECheck);
|
setCheckerboard(Check,ECheck);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user