mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Internal SHM comms in non-simd directions working
Need to fix simd directions
This commit is contained in:
parent
0fcd2e7188
commit
c190221fd3
@ -153,7 +153,7 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
Dw.Report();
|
||||
@ -192,7 +192,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
sDw.Report();
|
||||
|
||||
if(0){
|
||||
@ -262,7 +262,7 @@ int main (int argc, char ** argv)
|
||||
double flops=(1344.0*volume*ncall)/2;
|
||||
|
||||
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
|
||||
sDw.Report();
|
||||
|
||||
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||
@ -333,7 +333,7 @@ int main (int argc, char ** argv)
|
||||
double flops=(1344.0*volume*ncall)/2;
|
||||
|
||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
|
||||
Dw.Report();
|
||||
}
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
|
121
lib/Stencil.h
121
lib/Stencil.h
@ -32,6 +32,8 @@
|
||||
|
||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||
|
||||
const int ShmDirectCopy = 1;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
// gather to a point stencil code. CSHIFT is not the best way, so need
|
||||
@ -68,7 +70,7 @@
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace Grid {
|
||||
namespace Grid {
|
||||
|
||||
inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||
int off,std::vector<std::pair<int,int> > & table)
|
||||
@ -117,29 +119,16 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj,class cobj,class compressor> void
|
||||
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
|
||||
double &t_table ,double & t_data )
|
||||
{
|
||||
std::vector<std::pair<int,int> > table;
|
||||
Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table);
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
struct StencilEntry {
|
||||
struct StencilEntry {
|
||||
uint64_t _offset;
|
||||
uint64_t _byte_offset;
|
||||
uint16_t _is_local;
|
||||
uint16_t _permute;
|
||||
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
|
||||
};
|
||||
};
|
||||
|
||||
template<class vobj,class cobj>
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
template<class vobj,class cobj>
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
public:
|
||||
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
@ -181,6 +170,14 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension
|
||||
reqs.resize(Packets.size());
|
||||
commtime-=usecond();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if( ShmDirectCopy ) {
|
||||
_grid->StencilSendToRecvFromBegin(reqs[i],
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
}else{
|
||||
_grid->SendToRecvFromBegin(reqs[i],
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
@ -188,12 +185,16 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
}
|
||||
}
|
||||
commtime+=usecond();
|
||||
}
|
||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
commtime-=usecond();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if( ShmDirectCopy )
|
||||
_grid->StencilSendToRecvFromComplete(reqs[i]);
|
||||
else
|
||||
_grid->SendToRecvFromComplete(reqs[i]);
|
||||
}
|
||||
commtime+=usecond();
|
||||
@ -259,7 +260,6 @@ PARALLEL_FOR_LOOP
|
||||
if( _entries[i]._is_local ) {
|
||||
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
|
||||
} else {
|
||||
// PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
|
||||
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
|
||||
}
|
||||
}
|
||||
@ -269,7 +269,7 @@ PARALLEL_FOR_LOOP
|
||||
// _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
|
||||
}
|
||||
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
||||
uint64_t cbase = (uint64_t)&comm_buf[0];
|
||||
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
||||
local = _entries[ent]._is_local;
|
||||
perm = _entries[ent]._permute;
|
||||
if (perm) ptype = _permute_type[point];
|
||||
@ -280,23 +280,26 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
inline uint64_t GetPFInfo(int ent,uint64_t base) {
|
||||
uint64_t cbase = (uint64_t)&comm_buf[0];
|
||||
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
|
||||
int local = _entries[ent]._is_local;
|
||||
if (local) return base + _entries[ent]._byte_offset;
|
||||
else return cbase + _entries[ent]._byte_offset;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Comms buffers
|
||||
// Unified Comms buffers for all directions
|
||||
///////////////////////////////////////////////////////////
|
||||
std::vector<commVector<scalar_object> > u_simd_send_buf;
|
||||
std::vector<commVector<scalar_object> > u_simd_recv_buf;
|
||||
commVector<cobj> u_send_buf;
|
||||
commVector<cobj> comm_buf;
|
||||
commVector<cobj> u_recv_buf_hide;
|
||||
cobj* u_recv_buf_p;
|
||||
|
||||
int u_comm_offset;
|
||||
int _unified_buffer_size;
|
||||
|
||||
// Accessor for the unified receive buffer: hands back u_recv_buf_p, the raw
// pointer that GetInfo/GetPFInfo also use as the base for non-local entries.
cobj *CommBuf(void)
{
  return u_recv_buf_p;
}
|
||||
|
||||
/////////////////////////////////////////
|
||||
// Timing info; ugly; possibly temporary
|
||||
/////////////////////////////////////////
|
||||
@ -378,7 +381,6 @@ PARALLEL_FOR_LOOP
|
||||
int i = ii; // reverse direction to get SIMD comms done first
|
||||
int point = i;
|
||||
|
||||
|
||||
int dimension = directions[i];
|
||||
int displacement = distances[i];
|
||||
int shift = displacement;
|
||||
@ -426,7 +428,21 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
u_send_buf.resize(_unified_buffer_size);
|
||||
comm_buf.resize(_unified_buffer_size);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// Try to allocate for receiving in a shared memory region, fall back to buffer
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
if( ShmDirectCopy ) {
|
||||
|
||||
u_recv_buf_p=(cobj *)_grid->ShmBufferSelf();
|
||||
if ( u_recv_buf_p == NULL ) {
|
||||
u_recv_buf_hide.resize(_unified_buffer_size);
|
||||
u_recv_buf_p=&u_recv_buf_hide[0];
|
||||
}
|
||||
} else {
|
||||
u_recv_buf_hide.resize(_unified_buffer_size);
|
||||
u_recv_buf_p=&u_recv_buf_hide[0];
|
||||
}
|
||||
|
||||
PrecomputeByteOffsets();
|
||||
|
||||
@ -660,10 +676,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class compressor>
|
||||
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
std::vector<std::vector<CommsRequest_t> > reqs;
|
||||
calls++;
|
||||
@ -675,8 +688,7 @@ PARALLEL_FOR_LOOP
|
||||
CommsMerge(); // spins
|
||||
}
|
||||
|
||||
template<class compressor>
|
||||
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||
template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||
{
|
||||
int dimension = _directions[point];
|
||||
int displacement = _distances[point];
|
||||
@ -734,7 +746,6 @@ PARALLEL_FOR_LOOP
|
||||
assert(source._grid==_grid);
|
||||
halogtime-=usecond();
|
||||
|
||||
assert (comm_buf.size() == _unified_buffer_size );
|
||||
u_comm_offset=0;
|
||||
|
||||
// Gather all comms buffers
|
||||
@ -779,9 +790,6 @@ PARALLEL_FOR_LOOP
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
cobj *u_send_buf_p;
|
||||
cobj *comm_buf_p;
|
||||
|
||||
if (comm_proc) {
|
||||
|
||||
int words = buffer_size;
|
||||
@ -794,36 +802,48 @@ PARALLEL_FOR_LOOP
|
||||
if ( !face_table_computed ) {
|
||||
t_table-=usecond();
|
||||
face_table.resize(face_idx+1);
|
||||
cobj *ptr; ptr = &u_send_buf[0];
|
||||
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
|
||||
face_table[face_idx]);
|
||||
t_table+=usecond();
|
||||
}
|
||||
t_data-=usecond();
|
||||
Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++;
|
||||
t_data+=usecond();
|
||||
gathertime+=usecond();
|
||||
|
||||
|
||||
int rank = _grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
assert (xmit_to_rank != _grid->ThisRank());
|
||||
assert (recv_from_rank != _grid->ThisRank());
|
||||
|
||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||
AddPacket((void *)&u_send_buf[u_comm_offset],
|
||||
(void *) &comm_buf[u_comm_offset],
|
||||
/////////////////////////////////////////////////////////
|
||||
// try the direct copy if possible
|
||||
/////////////////////////////////////////////////////////
|
||||
|
||||
cobj *u_send_buf_p = &u_send_buf[0];
|
||||
if (ShmDirectCopy) {
|
||||
cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank);
|
||||
if ( shm!=NULL) {
|
||||
u_send_buf_p = shm;
|
||||
}
|
||||
}
|
||||
|
||||
t_data-=usecond();
|
||||
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++;
|
||||
t_data+=usecond();
|
||||
|
||||
AddPacket((void *)&u_send_buf_p[u_comm_offset],
|
||||
(void *)&u_recv_buf_p[u_comm_offset],
|
||||
xmit_to_rank,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
gathertime+=usecond();
|
||||
u_comm_offset+=words;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class compressor>
|
||||
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
|
||||
{
|
||||
@ -904,10 +924,6 @@ PARALLEL_FOR_LOOP
|
||||
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
|
||||
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
|
||||
|
||||
void *vrp = (void *)rp;
|
||||
void *vsp = (void *)sp;
|
||||
|
||||
|
||||
if(nbr_proc){
|
||||
|
||||
int recv_from_rank;
|
||||
@ -915,7 +931,7 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
|
||||
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
|
||||
|
||||
rpointers[i] = rp;
|
||||
|
||||
@ -926,13 +942,14 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
|
||||
AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
|
||||
assert(0);
|
||||
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
|
||||
|
||||
u_comm_offset +=buffer_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
@ -80,7 +80,6 @@ class CartesianCommunicator {
|
||||
|
||||
void * ShmCommBuf;
|
||||
std::vector<void *> ShmCommBufs;
|
||||
std::vector<void *> ShmStencilBufs;
|
||||
|
||||
int WorldRank;
|
||||
int WorldSize;
|
||||
@ -105,6 +104,10 @@ class CartesianCommunicator {
|
||||
int RankFromProcessorCoor(std::vector<int> &coor);
|
||||
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
|
||||
|
||||
// Helper function for SHM Windows in MPI3
|
||||
void *ShmBufferSelf(void);
|
||||
void *ShmBuffer(int rank);
|
||||
|
||||
/////////////////////////////////
|
||||
// Grid information queries
|
||||
/////////////////////////////////
|
||||
@ -173,6 +176,16 @@ class CartesianCommunicator {
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
||||
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Barrier
|
||||
|
@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
|
||||
assert(Size==_Nprocessors);
|
||||
}
|
||||
// Stub: this implementation always returns a null pointer, i.e. it never
// hands out a shared-memory buffer for the calling rank.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  return nullptr;
}
|
||||
// Stub: `rank` is ignored and a null pointer is always returned, i.e. no
// peer's buffer is ever reported as directly addressable.
void *CartesianCommunicator::ShmBuffer(int rank)
{
  return nullptr;
}
|
||||
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
|
@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Verbose for now
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
std::cout<< "Ranks per node "<< ShmSize << std::endl;
|
||||
std::cout<< "Nodes "<< GroupSize << std::endl;
|
||||
std::cout<< "Ranks "<< WorldSize << std::endl;
|
||||
std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;
|
||||
std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
|
||||
std::cout<< " Nodes "<< GroupSize;
|
||||
std::cout<< " Ranks "<< WorldSize;
|
||||
std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;
|
||||
|
||||
// Done
|
||||
ShmSetup=1;
|
||||
@ -208,12 +208,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
}
|
||||
|
||||
ShmCommBufs.resize(ShmSize);
|
||||
ShmStencilBufs.resize(ShmSize);
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
MPI_Aint sz;
|
||||
int dsp_unit;
|
||||
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
|
||||
ShmStencilBufs[r] = (void *) ((uint64_t)ShmCommBufs[r]+MAX_MPI_SHM_BYTES/4);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
@ -240,6 +238,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
ShmCoor.resize(_ndimension);
|
||||
GroupCoor.resize(_ndimension);
|
||||
WorldCoor.resize(_ndimension);
|
||||
|
||||
for(int l2=0;l2<log2size;l2++){
|
||||
while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
|
||||
ShmDims[dim]*=2;
|
||||
@ -347,6 +346,21 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void *CartesianCommunicator::ShmBufferSelf(void)
|
||||
{
|
||||
return ShmCommBufs[ShmRank];
|
||||
}
|
||||
void *CartesianCommunicator::ShmBuffer(int rank)
|
||||
{
|
||||
int gpeer = GroupRanks[rank];
|
||||
if (gpeer == MPI_UNDEFINED){
|
||||
return NULL;
|
||||
} else {
|
||||
return ShmCommBufs[gpeer];
|
||||
}
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
#undef SHM_USE_BCOPY
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
static int sequence;
|
||||
|
||||
int rank = _processor;
|
||||
int ierr;
|
||||
int tag;
|
||||
int check;
|
||||
@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
assert(from != _processor);
|
||||
|
||||
int gdest = GroupRanks[dest];
|
||||
int gfrom = GroupRanks[from];
|
||||
int gme = GroupRanks[_processor];
|
||||
|
||||
sequence++;
|
||||
@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
|
||||
int small = (bytes<MAX_MPI_SHM_BYTES);
|
||||
|
||||
#ifndef SHM_USE_BCOPY
|
||||
typedef vRealD T;
|
||||
int words = bytes/sizeof(T);
|
||||
assert(((size_t)bytes &(sizeof(T)-1))==0);
|
||||
// assert(((size_t)xmit &(sizeof(T)-1))==0);
|
||||
// assert(((size_t)recv &(sizeof(T)-1))==0);
|
||||
#endif
|
||||
|
||||
assert(((size_t)bytes &(sizeof(T)-1))==0);
|
||||
assert(gme == ShmRank);
|
||||
|
||||
// std::cerr << "proc dest from gme gdest "<<_processor<<" "<<dest <<" "<< from <<" "<<gme<<" "<< gdest<<std::endl; Barrier();
|
||||
if ( small && (dest !=MPI_UNDEFINED) ) {
|
||||
if ( small && (gdest !=MPI_UNDEFINED) ) {
|
||||
|
||||
assert(gme != gdest);
|
||||
|
||||
#ifdef SHM_USE_BCOPY
|
||||
bcopy(xmit,to_ptr,bytes);
|
||||
#else
|
||||
T *ip = (T *)xmit;
|
||||
T *op = (T *)to_ptr;
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int w=0;w<words;w++) {
|
||||
vstream(op[w],ip[w]);
|
||||
}
|
||||
#endif
|
||||
|
||||
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
|
||||
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
|
||||
} else {
|
||||
@ -411,24 +417,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
list.push_back(xrq);
|
||||
}
|
||||
|
||||
// std::cout << "Syncing "<<std::endl; Barrier();
|
||||
MPI_Win_sync (ShmWindow);
|
||||
MPI_Barrier (ShmComm);
|
||||
MPI_Win_sync (ShmWindow);
|
||||
|
||||
// std::cout << "Receiving "<<std::endl; Barrier();
|
||||
|
||||
if (small && (from !=MPI_UNDEFINED) ) {
|
||||
#ifdef SHM_USE_BCOPY
|
||||
bcopy(from_ptr,recv,bytes);
|
||||
#else
|
||||
if (small && (gfrom !=MPI_UNDEFINED) ) {
|
||||
T *ip = (T *)from_ptr;
|
||||
T *op = (T *)recv;
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int w=0;w<words;w++) {
|
||||
vstream(op[w],ip[w]);
|
||||
}
|
||||
#endif
|
||||
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
|
||||
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
|
||||
assert(check==sequence);
|
||||
@ -439,27 +438,51 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
list.push_back(rrq);
|
||||
}
|
||||
|
||||
// std::cout << "Syncing"<<std::endl; Barrier();
|
||||
MPI_Win_sync (ShmWindow);
|
||||
MPI_Barrier (ShmComm);
|
||||
MPI_Win_sync (ShmWindow);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
|
||||
int gdest = GroupRanks[dest];
|
||||
int gfrom = GroupRanks[from];
|
||||
int gme = GroupRanks[_processor];
|
||||
|
||||
assert(gme == ShmRank);
|
||||
|
||||
if ( gdest == MPI_UNDEFINED ) {
|
||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
}
|
||||
|
||||
if ( gfrom ==MPI_UNDEFINED) {
|
||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
}
|
||||
|
||||
MPI_Win_sync (ShmWindow);
|
||||
MPI_Barrier (ShmComm);
|
||||
MPI_Win_sync (ShmWindow);
|
||||
|
||||
#if 0
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
int rank = _processor;
|
||||
int ierr;
|
||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
|
||||
assert(ierr==0);
|
||||
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
|
||||
}
|
||||
|
||||
// Always reports rank 0.
int Rank(void)
{
  return 0;
}
|
||||
// Stub: unconditionally returns a null pointer, so no shared-memory buffer
// is ever offered for the calling rank.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  return nullptr;
}
|
||||
// Stub: the `rank` argument is not consulted; the result is always null.
void *CartesianCommunicator::ShmBuffer(int rank)
{
  return nullptr;
}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
|
@ -50,6 +50,14 @@ typedef struct HandShake_t {
|
||||
static Vector< HandShake > XConnections;
|
||||
static Vector< HandShake > RConnections;
|
||||
|
||||
// Stub: returns a null pointer for every call; this implementation exposes
// no buffer through this interface.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  return nullptr;
}
|
||||
// Stub: returns a null pointer whatever `rank` is passed.
void *CartesianCommunicator::ShmBuffer(int rank)
{
  return nullptr;
}
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
shmem_init();
|
||||
XConnections.resize(shmem_n_pes());
|
||||
|
@ -33,8 +33,7 @@ directory
|
||||
#define GRID_QCD_FERMION_OPERATOR_IMPL_H
|
||||
|
||||
namespace Grid {
|
||||
|
||||
namespace QCD {
|
||||
namespace QCD {
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -108,13 +107,14 @@ namespace Grid {
|
||||
INHERIT_GIMPL_TYPES(Base) \
|
||||
INHERIT_FIMPL_TYPES(Base)
|
||||
|
||||
///////
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour four spinors with colour index
|
||||
///////
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
|
||||
class WilsonImpl
|
||||
: public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||
|
||||
public:
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||
|
||||
@ -124,7 +124,6 @@ namespace Grid {
|
||||
const bool LsVectorised=false;
|
||||
typedef _Coeff_t Coeff_t;
|
||||
|
||||
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
|
||||
@ -158,8 +157,7 @@ namespace Grid {
|
||||
}
|
||||
|
||||
template <class ref>
|
||||
inline void loadLinkElement(Simd &reg,
|
||||
ref &memory) {
|
||||
inline void loadLinkElement(Simd &reg, ref &memory) {
|
||||
reg = memory;
|
||||
}
|
||||
|
||||
@ -202,11 +200,12 @@ namespace Grid {
|
||||
}
|
||||
};
|
||||
|
||||
///////
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour four spinors with colour index, 5d redblack
|
||||
///////
|
||||
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
|
||||
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
|
||||
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
|
||||
public:
|
||||
|
||||
static const int Dimension = Nrepresentation;
|
||||
@ -227,12 +226,9 @@ namespace Grid {
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
|
||||
// Make the doubled gauge field a *scalar*
|
||||
typedef iImplDoubledGaugeField<typename Simd::scalar_type>
|
||||
SiteDoubledGaugeField; // This is a scalar
|
||||
typedef iImplGaugeField<typename Simd::scalar_type>
|
||||
SiteScalarGaugeField; // scalar
|
||||
typedef iImplGaugeLink<typename Simd::scalar_type>
|
||||
SiteScalarGaugeLink; // scalar
|
||||
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
|
||||
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
|
||||
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
|
||||
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
@ -250,6 +246,7 @@ namespace Grid {
|
||||
inline void loadLinkElement(Simd &reg, ref &memory) {
|
||||
vsplat(reg, memory);
|
||||
}
|
||||
|
||||
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
|
||||
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
|
||||
StencilImpl &St) {
|
||||
@ -262,8 +259,8 @@ namespace Grid {
|
||||
mult(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
|
||||
const GaugeField &Umu) {
|
||||
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
{
|
||||
SiteScalarGaugeField ScalarUmu;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
@ -289,25 +286,25 @@ namespace Grid {
|
||||
}
|
||||
}
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
|
||||
FermionField &A, int mu) {
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
|
||||
FermionField &Atilde, int mu) {
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Flavour doubled spinors; is Gparity the only? what about C*?
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class S, int Nrepresentation,class _Coeff_t = RealD>
|
||||
class GparityWilsonImpl
|
||||
: public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
|
||||
template <class S, int Nrepresentation,class _Coeff_t = RealD>
|
||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
|
||||
public:
|
||||
|
||||
static const int Dimension = Nrepresentation;
|
||||
|
||||
const bool LsVectorised=false;
|
||||
@ -317,15 +314,9 @@ namespace Grid {
|
||||
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template <typename vtype>
|
||||
using iImplSpinor =
|
||||
iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||
template <typename vtype>
|
||||
using iImplHalfSpinor =
|
||||
iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
|
||||
template <typename vtype>
|
||||
using iImplDoubledGaugeField =
|
||||
iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
|
||||
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
@ -341,7 +332,6 @@ namespace Grid {
|
||||
|
||||
ImplParams Params;
|
||||
|
||||
|
||||
GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
|
||||
|
||||
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
|
||||
@ -351,6 +341,7 @@ namespace Grid {
|
||||
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
|
||||
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
|
||||
StencilImpl &St) {
|
||||
|
||||
typedef SiteHalfSpinor vobj;
|
||||
typedef typename SiteHalfSpinor::scalar_object sobj;
|
||||
|
||||
@ -419,7 +410,6 @@ namespace Grid {
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
{
|
||||
|
||||
conformable(Uds._grid,GaugeGrid);
|
||||
conformable(Umu._grid,GaugeGrid);
|
||||
|
||||
@ -429,7 +419,6 @@ namespace Grid {
|
||||
|
||||
Lattice<iScalar<vInteger> > coor(GaugeGrid);
|
||||
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
LatticeCoordinate(coor,mu);
|
||||
@ -443,8 +432,7 @@ namespace Grid {
|
||||
Uconj = where(coor==neglink,-Uconj,Uconj);
|
||||
}
|
||||
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for(auto ss=U.begin();ss<U.end();ss++){
|
||||
Uds[ss](0)(mu) = U[ss]();
|
||||
Uds[ss](1)(mu) = Uconj[ss]();
|
||||
@ -458,7 +446,7 @@ namespace Grid {
|
||||
Utmp = where(coor==0,Uconj,Utmp);
|
||||
}
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for(auto ss=U.begin();ss<U.end();ss++){
|
||||
Uds[ss](0)(mu+4) = Utmp[ss]();
|
||||
}
|
||||
@ -468,7 +456,7 @@ namespace Grid {
|
||||
Utmp = where(coor==0,U,Utmp);
|
||||
}
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for(auto ss=U.begin();ss<U.end();ss++){
|
||||
Uds[ss](1)(mu+4) = Utmp[ss]();
|
||||
}
|
||||
@ -477,13 +465,13 @@ namespace Grid {
|
||||
}
|
||||
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
|
||||
FermionField &A, int mu) {
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
|
||||
|
||||
// DhopDir provides U or Uconj depending on coor/flavour.
|
||||
GaugeLinkField link(mat._grid);
|
||||
// use lorentz for flavour as hack.
|
||||
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
|
||||
link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
|
||||
}
|
||||
@ -491,13 +479,13 @@ namespace Grid {
|
||||
return;
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
|
||||
FermionField &Atilde, int mu) {
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
|
||||
|
||||
int Ls = Btilde._grid->_fdimensions[0];
|
||||
|
||||
GaugeLinkField tmp(mat._grid);
|
||||
tmp = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
int sF = s + Ls * ss;
|
||||
@ -508,13 +496,13 @@ namespace Grid {
|
||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
|
||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
|
||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
|
||||
|
||||
|
||||
typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
|
||||
typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
|
||||
typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
|
||||
@ -535,9 +523,10 @@ namespace Grid {
|
||||
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
|
||||
|
||||
typedef GparityWilsonImpl<vComplex, Nc> GparityWilsonImplR; // Real.. whichever prec
|
||||
typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec
|
||||
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
|
||||
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
|
||||
}
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
||||
|
@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
////////////////////////
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int sss = 0; sss < B._grid->oSites(); sss++) {
|
||||
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu,
|
||||
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
|
||||
gamma);
|
||||
}
|
||||
|
||||
@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out,
|
||||
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
|
||||
dirdisp, gamma);
|
||||
}
|
||||
};
|
||||
@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
if (dag == DaggerYes) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
|
||||
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
|
||||
out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
|
||||
Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
|
||||
out);
|
||||
}
|
||||
}
|
||||
|
@ -185,18 +185,14 @@ void WilsonFermion5D<Impl>::Report(void)
|
||||
if ( DhopCalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
|
||||
<< " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : "
|
||||
<< DhopComputeTime << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
|
||||
}
|
||||
|
||||
@ -210,12 +206,9 @@ void WilsonFermion5D<Impl>::Report(void)
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
|
||||
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
|
||||
|
||||
|
||||
|
||||
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
||||
|
||||
}
|
||||
|
||||
if (DerivCalls > 0 || DhopCalls > 0){
|
||||
@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sU=ss;
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
|
||||
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
assert(sF < B._grid->oSites());
|
||||
assert(sU < U._grid->oSites());
|
||||
|
||||
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu,
|
||||
gamma);
|
||||
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
|
||||
|
||||
////////////////////////////
|
||||
// spin trace outer product
|
||||
@ -342,7 +334,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,
|
||||
void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
@ -412,26 +404,24 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||
int sU = ss;
|
||||
int sF = LLs * sU;
|
||||
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
|
||||
out);
|
||||
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
|
||||
}
|
||||
#ifdef AVX512
|
||||
} else if (stat.is_init() ) {
|
||||
|
||||
int nthreads;
|
||||
stat.start();
|
||||
#pragma omp parallel
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp master
|
||||
#pragma omp master
|
||||
nthreads = omp_get_num_threads();
|
||||
int mythread = omp_get_thread_num();
|
||||
stat.enter(mythread);
|
||||
#pragma omp for nowait
|
||||
for(int ss=0;ss<U._grid->oSites();ss++)
|
||||
{
|
||||
#pragma omp for nowait
|
||||
for(int ss=0;ss<U._grid->oSites();ss++) {
|
||||
int sU=ss;
|
||||
int sF=LLs*sU;
|
||||
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||
}
|
||||
stat.exit(mythread);
|
||||
}
|
||||
@ -442,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||
int sU = ss;
|
||||
int sF = LLs * sU;
|
||||
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
|
||||
out);
|
||||
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||
}
|
||||
}
|
||||
DhopComputeTime+=usecond();
|
||||
|
@ -34,8 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/Stat.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
namespace QCD {
|
||||
namespace QCD {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// This is the 4d red black case appropriate to support
|
||||
@ -182,7 +181,7 @@ namespace Grid {
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
||||
|
@ -43,9 +43,8 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
|
||||
////////////////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf, int sF,
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionField &in, FermionField &out) {
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
@ -220,9 +219,8 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
|
||||
|
||||
// Need controls to do interior, exterior, or both
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf, int sF,
|
||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionField &in, FermionField &out) {
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopDir(
|
||||
StencilImpl &st, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf, int sF,
|
||||
void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
|
||||
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteSpinor result;
|
||||
|
@ -32,40 +32,34 @@ directory
|
||||
#define GRID_QCD_DHOP_H
|
||||
|
||||
namespace Grid {
|
||||
|
||||
namespace QCD {
|
||||
namespace QCD {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper routines that implement Wilson stencil for a single site.
|
||||
// Common to both the WilsonFermion and WilsonFermion5D
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
class WilsonKernelsStatic {
|
||||
class WilsonKernelsStatic {
|
||||
public:
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
static int AsmOpt; // these are a temporary hack
|
||||
static int HandOpt; // these are a temporary hack
|
||||
};
|
||||
};
|
||||
|
||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
|
||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef FermionOperator<Impl> Base;
|
||||
|
||||
public:
|
||||
public:
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
|
||||
DiracOptDhopSite(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,
|
||||
FermionField &out) {
|
||||
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
||||
#ifdef AVX512
|
||||
if (AsmOpt) {
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
|
||||
in, out);
|
||||
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
} else {
|
||||
#else
|
||||
{
|
||||
@ -73,11 +67,9 @@ namespace Grid {
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
if (HandOpt)
|
||||
WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
|
||||
in, out);
|
||||
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||
else
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
|
||||
in, out);
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
@ -87,15 +79,12 @@ namespace Grid {
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
|
||||
DiracOptDhopSite(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,
|
||||
FermionField &out) {
|
||||
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
||||
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
|
||||
out);
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
@ -103,17 +92,12 @@ namespace Grid {
|
||||
}
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
|
||||
void>::type
|
||||
DiracOptDhopSiteDag(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,
|
||||
FermionField &out) {
|
||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
|
||||
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
||||
#ifdef AVX512
|
||||
if (AsmOpt) {
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
|
||||
Ns, in, out);
|
||||
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||
} else {
|
||||
#else
|
||||
{
|
||||
@ -121,11 +105,9 @@ namespace Grid {
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
if (HandOpt)
|
||||
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
|
||||
in, out);
|
||||
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||
else
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
|
||||
sU, in, out);
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
@ -134,73 +116,48 @@ namespace Grid {
|
||||
}
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<
|
||||
(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
|
||||
void>::type
|
||||
DiracOptDhopSiteDag(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,
|
||||
FermionField &out) {
|
||||
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
|
||||
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
|
||||
|
||||
for (int site = 0; site < Ns; site++) {
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
|
||||
in, out);
|
||||
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||
sF++;
|
||||
}
|
||||
sU++;
|
||||
}
|
||||
}
|
||||
|
||||
void DiracOptDhopDir(
|
||||
StencilImpl &st, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
|
||||
int gamma);
|
||||
void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
|
||||
|
||||
private:
|
||||
private:
|
||||
// Specialised variants
|
||||
void DiracOptGenericDhopSite(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptGenericDhopSiteDag(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptAsmDhopSite(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,
|
||||
FermionField &out);
|
||||
void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||
|
||||
void DiracOptAsmDhopSiteDag(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,
|
||||
FermionField &out);
|
||||
void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptHandDhopSite(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptHandDhopSiteDag(
|
||||
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
public:
|
||||
public:
|
||||
|
||||
WilsonKernels(const ImplParams &p = ImplParams());
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
}}
|
||||
|
||||
#endif
|
||||
|
@ -33,31 +33,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
namespace QCD {
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Default to no assembler implementation
|
||||
///////////////////////////////////////////////////////////
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
///////////////////////////////////////////////////////////
|
||||
// Default to no assembler implementation
|
||||
///////////////////////////////////////////////////////////
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
{
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
{
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
#if defined(AVX512)
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
@ -65,7 +61,7 @@ namespace Grid {
|
||||
#include <simd/Intel512wilson.h>
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
static Vector<vComplexF> signs;
|
||||
static Vector<vComplexF> signs;
|
||||
|
||||
int setupSigns(void ){
|
||||
Vector<vComplexF> bother(2);
|
||||
@ -84,16 +80,14 @@ namespace Grid {
|
||||
#define FX(A) WILSONASM_ ##A
|
||||
|
||||
#undef KERNEL_DAG
|
||||
template<>
|
||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#define KERNEL_DAG
|
||||
template<>
|
||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
@ -109,31 +103,26 @@ namespace Grid {
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
|
||||
#undef KERNEL_DAG
|
||||
template<>
|
||||
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#define KERNEL_DAG
|
||||
template<>
|
||||
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define INSTANTIATE_ASM(A)\
|
||||
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
|
||||
commVector<SiteHalfSpinor> &buf,\
|
||||
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
|
||||
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
|
||||
commVector<SiteHalfSpinor> &buf,\
|
||||
\
|
||||
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
|
||||
|
||||
|
||||
INSTANTIATE_ASM(WilsonImplF);
|
||||
INSTANTIATE_ASM(WilsonImplD);
|
||||
INSTANTIATE_ASM(ZWilsonImplF);
|
||||
@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF);
|
||||
INSTANTIATE_ASM(DomainWallVec5dImplD);
|
||||
INSTANTIATE_ASM(ZDomainWallVec5dImplF);
|
||||
INSTANTIATE_ASM(ZDomainWallVec5dImplD);
|
||||
}
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
|
@ -311,9 +311,8 @@ namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
@ -554,9 +553,8 @@ namespace QCD {
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
// std::cout << "Hand op Dhop "<<std::endl;
|
||||
@ -798,37 +796,34 @@ namespace QCD {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Specialise Gparity to simple implementation
|
||||
////////////////////////////////////////////////
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
SiteHalfSpinor *buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
SiteHalfSpinor *buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<>
|
||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||
commVector<SiteHalfSpinor> &buf,
|
||||
template<> void
|
||||
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
assert(0);
|
||||
@ -840,11 +835,9 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
||||
// Need Nc=3 though //
|
||||
|
||||
#define INSTANTIATE_THEM(A) \
|
||||
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
|
||||
commVector<SiteHalfSpinor> &buf,\
|
||||
int ss,int sU,const FermionField &in, FermionField &out);\
|
||||
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
|
||||
commVector<SiteHalfSpinor> &buf,\
|
||||
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionField &in, FermionField &out); \
|
||||
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
INSTANTIATE_THEM(WilsonImplF);
|
||||
|
@ -116,7 +116,7 @@ int main (int argc, char ** argv)
|
||||
else if (SE->_is_local)
|
||||
Check._odata[i] = Foo._odata[SE->_offset];
|
||||
else
|
||||
Check._odata[i] = myStencil.comm_buf[SE->_offset];
|
||||
Check._odata[i] = myStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
|
||||
Real nrmC = norm2(Check);
|
||||
@ -207,7 +207,7 @@ int main (int argc, char ** argv)
|
||||
else if (SE->_is_local)
|
||||
OCheck._odata[i] = EFoo._odata[SE->_offset];
|
||||
else
|
||||
OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
|
||||
OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
for(int i=0;i<ECheck._grid->oSites();i++){
|
||||
int permute_type;
|
||||
@ -220,7 +220,7 @@ int main (int argc, char ** argv)
|
||||
else if (SE->_is_local)
|
||||
ECheck._odata[i] = OFoo._odata[SE->_offset];
|
||||
else
|
||||
ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
|
||||
ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
|
||||
setCheckerboard(Check,ECheck);
|
||||
|
Loading…
Reference in New Issue
Block a user