mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Internal SHM comms in non-SIMD directions working

Need to fix SIMD directions
azusayamaguchi 2016-10-22 18:14:27 +01:00
parent 0fcd2e7188
commit c190221fd3
16 changed files with 1729 additions and 1739 deletions
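
For orientation, here is a minimal self-contained C++ sketch (mock types, no MPI, not Grid code) of the send-side decision this commit introduces: when ShmDirectCopy is enabled and the destination rank sits in the same shared-memory group, the face is gathered straight into that peer's shared window (ShmBuffer(rank) returns a non-null pointer), so the later StencilSendToRecvFromBegin needs no network send for it; otherwise the face is gathered into the ordinary send buffer and sent over MPI. MockComm, gather_face, and the buffers below are illustrative stand-ins, not the Grid API.

// Minimal sketch (mock types, no MPI) of the ShmDirectCopy send-side logic.
// MockComm and gather_face are illustrative stand-ins, not Grid classes.
#include <cstring>
#include <iostream>
#include <map>
#include <vector>

struct MockComm {
  int me;
  // Shared windows for ranks on this "node"; absent => remote rank.
  std::map<int, std::vector<char>> shm_windows;

  // Analogue of CartesianCommunicator::ShmBuffer: non-null only for
  // ranks reachable through the shared-memory window.
  char *ShmBuffer(int rank) {
    auto it = shm_windows.find(rank);
    return (it == shm_windows.end()) ? nullptr : it->second.data();
  }
};

// Gather a face of 'bytes' bytes into 'dst' (stands in for
// Gather_plane_simple_table, which really walks a precomputed table).
void gather_face(const std::vector<char> &lattice, char *dst, size_t bytes) {
  std::memcpy(dst, lattice.data(), bytes);
}

int main() {
  const int ShmDirectCopy = 1;          // compile-time switch, as in the commit
  const size_t bytes = 64;

  MockComm comm{0, {{1, std::vector<char>(bytes)}}}; // rank 1 shares our node
  std::vector<char> lattice(bytes, 'x');
  std::vector<char> u_send_buf(bytes);               // fallback send buffer

  for (int to_rank : {1, 2}) {                       // rank 2 is "off node"
    char *send_target = u_send_buf.data();
    bool direct = false;
    if (ShmDirectCopy) {
      if (char *shm = comm.ShmBuffer(to_rank)) {     // same-node peer?
        send_target = shm;                           // gather into its window
        direct = true;
      }
    }
    gather_face(lattice, send_target, bytes);
    if (direct)
      std::cout << "rank " << to_rank << ": shared-memory copy, no MPI send\n";
    else
      std::cout << "rank " << to_rank << ": gathered to send buffer, MPI send needed\n";
  }
  return 0;
}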

View File

@ -153,7 +153,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
@ -192,7 +192,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report();
if(0){
@ -262,7 +262,7 @@ int main (int argc, char ** argv)
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report();
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@ -333,7 +333,7 @@ int main (int argc, char ** argv)
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);

View File

@ -32,6 +32,8 @@
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
const int ShmDirectCopy = 1;
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so need
@ -68,7 +70,7 @@
//
//////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
namespace Grid {
inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table)
@ -117,29 +119,16 @@ PARALLEL_FOR_LOOP
}
}
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
double &t_table ,double & t_data )
{
std::vector<std::pair<int,int> > table;
Gather_plane_simple_table_compute (rhs._grid,dimension,plane,cbmask,off,table);
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
Gather_plane_simple_table (table,rhs,buffer,compress,off,so);
}
struct StencilEntry {
struct StencilEntry {
uint64_t _offset;
uint64_t _byte_offset;
uint16_t _is_local;
uint16_t _permute;
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
};
};
template<class vobj,class cobj>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
template<class vobj,class cobj>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
@ -181,6 +170,14 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension
reqs.resize(Packets.size());
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
if( ShmDirectCopy ) {
_grid->StencilSendToRecvFromBegin(reqs[i],
Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes);
}else{
_grid->SendToRecvFromBegin(reqs[i],
Packets[i].send_buf,
Packets[i].to_rank,
@ -188,12 +185,16 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,cobj *buffer,int dimension
Packets[i].from_rank,
Packets[i].bytes);
}
}
commtime+=usecond();
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
if( ShmDirectCopy )
_grid->StencilSendToRecvFromComplete(reqs[i]);
else
_grid->SendToRecvFromComplete(reqs[i]);
}
commtime+=usecond();
@ -259,7 +260,6 @@ PARALLEL_FOR_LOOP
if( _entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
} else {
// PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
}
}
@ -269,7 +269,7 @@ PARALLEL_FOR_LOOP
// _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
}
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
uint64_t cbase = (uint64_t)&comm_buf[0];
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
local = _entries[ent]._is_local;
perm = _entries[ent]._permute;
if (perm) ptype = _permute_type[point];
@ -280,23 +280,26 @@ PARALLEL_FOR_LOOP
}
}
inline uint64_t GetPFInfo(int ent,uint64_t base) {
uint64_t cbase = (uint64_t)&comm_buf[0];
uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
int local = _entries[ent]._is_local;
if (local) return base + _entries[ent]._byte_offset;
else return cbase + _entries[ent]._byte_offset;
}
///////////////////////////////////////////////////////////
// Comms buffers
// Unified Comms buffers for all directions
///////////////////////////////////////////////////////////
std::vector<commVector<scalar_object> > u_simd_send_buf;
std::vector<commVector<scalar_object> > u_simd_recv_buf;
commVector<cobj> u_send_buf;
commVector<cobj> comm_buf;
commVector<cobj> u_recv_buf_hide;
cobj* u_recv_buf_p;
int u_comm_offset;
int _unified_buffer_size;
cobj *CommBuf(void) { return u_recv_buf_p; }
/////////////////////////////////////////
// Timing info; ugly; possibly temporary
/////////////////////////////////////////
@ -378,7 +381,6 @@ PARALLEL_FOR_LOOP
int i = ii; // reverse direction to get SIMD comms done first
int point = i;
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
@ -426,7 +428,21 @@ PARALLEL_FOR_LOOP
}
}
u_send_buf.resize(_unified_buffer_size);
comm_buf.resize(_unified_buffer_size);
/////////////////////////////////////////////////////////////////////////////////
// Try to allocate for receiving in a shared memory region, fall back to buffer
/////////////////////////////////////////////////////////////////////////////////
if( ShmDirectCopy ) {
u_recv_buf_p=(cobj *)_grid->ShmBufferSelf();
if ( u_recv_buf_p == NULL ) {
u_recv_buf_hide.resize(_unified_buffer_size);
u_recv_buf_p=&u_recv_buf_hide[0];
}
} else {
u_recv_buf_hide.resize(_unified_buffer_size);
u_recv_buf_p=&u_recv_buf_hide[0];
}
PrecomputeByteOffsets();
@ -660,10 +676,7 @@ PARALLEL_FOR_LOOP
}
}
template<class compressor>
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
std::vector<std::vector<CommsRequest_t> > reqs;
calls++;
@ -675,8 +688,7 @@ PARALLEL_FOR_LOOP
CommsMerge(); // spins
}
template<class compressor>
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
{
int dimension = _directions[point];
int displacement = _distances[point];
@ -734,7 +746,6 @@ PARALLEL_FOR_LOOP
assert(source._grid==_grid);
halogtime-=usecond();
assert (comm_buf.size() == _unified_buffer_size );
u_comm_offset=0;
// Gather all comms buffers
@ -779,9 +790,6 @@ PARALLEL_FOR_LOOP
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
cobj *u_send_buf_p;
cobj *comm_buf_p;
if (comm_proc) {
int words = buffer_size;
@ -794,36 +802,48 @@ PARALLEL_FOR_LOOP
if ( !face_table_computed ) {
t_table-=usecond();
face_table.resize(face_idx+1);
cobj *ptr; ptr = &u_send_buf[0];
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
face_table[face_idx]);
t_table+=usecond();
}
t_data-=usecond();
Gather_plane_simple_table (face_table[face_idx],rhs,&u_send_buf[0],compress,u_comm_offset,so); face_idx++;
t_data+=usecond();
gathertime+=usecond();
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
AddPacket((void *)&u_send_buf[u_comm_offset],
(void *) &comm_buf[u_comm_offset],
/////////////////////////////////////////////////////////
// try the direct copy if possible
/////////////////////////////////////////////////////////
cobj *u_send_buf_p = &u_send_buf[0];
if (ShmDirectCopy) {
cobj *shm = (cobj *) _grid->ShmBuffer(xmit_to_rank);
if ( shm!=NULL) {
u_send_buf_p = shm;
}
}
t_data-=usecond();
Gather_plane_simple_table (face_table[face_idx],rhs,u_send_buf_p,compress,u_comm_offset,so); face_idx++;
t_data+=usecond();
AddPacket((void *)&u_send_buf_p[u_comm_offset],
(void *)&u_recv_buf_p[u_comm_offset],
xmit_to_rank,
recv_from_rank,
bytes);
gathertime+=usecond();
u_comm_offset+=words;
}
}
}
template<class compressor>
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{
@ -904,10 +924,6 @@ PARALLEL_FOR_LOOP
auto rp = &u_simd_recv_buf[i ][u_comm_offset];
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
void *vrp = (void *)rp;
void *vsp = (void *)sp;
if(nbr_proc){
int recv_from_rank;
@ -915,7 +931,7 @@ PARALLEL_FOR_LOOP
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
rpointers[i] = rp;
@ -926,13 +942,14 @@ PARALLEL_FOR_LOOP
}
}
AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
assert(0);
AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
u_comm_offset +=buffer_size;
}
}
}
};
}
};
}
#endif
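
As a reading aid for the GetInfo/GetPFInfo change above: each StencilEntry keeps an _is_local flag and a precomputed byte offset, and the kernels resolve that offset against either the field's own storage or the unified receive buffer now exposed by CommBuf() (formerly the comm_buf vector). The following self-contained sketch models just that address arithmetic with mock data; Entry, field_base, and recv_base are illustrative names, not Grid's.

// Minimal sketch of the stencil-entry address resolution (mock types).
// 'Entry' mirrors only the roles of StencilEntry::_is_local/_byte_offset.
#include <cstdint>
#include <iostream>
#include <vector>

struct Entry {
  uint64_t byte_offset; // precomputed as offset*sizeof(site object)
  bool     is_local;    // true: neighbour lives in this rank's field
};

int main() {
  std::vector<double> field(8, 1.0);   // stands in for the local lattice data
  std::vector<double> recv(8, 2.0);    // stands in for u_recv_buf_p / CommBuf()

  // uint64_t bases, matching the pattern used in GetInfo/GetPFInfo.
  uint64_t field_base = reinterpret_cast<uint64_t>(field.data());
  uint64_t recv_base  = reinterpret_cast<uint64_t>(recv.data());

  std::vector<Entry> entries = {
    {3 * sizeof(double), true},        // local neighbour at site 3
    {5 * sizeof(double), false},       // halo neighbour at receive-buffer slot 5
  };

  for (const Entry &e : entries) {
    // Same pattern as GetInfo/GetPFInfo: pick the base, add the byte offset.
    uint64_t base = e.is_local ? field_base : recv_base;
    const double *ptr = reinterpret_cast<const double *>(base + e.byte_offset);
    std::cout << (e.is_local ? "local " : "halo  ") << *ptr << "\n";
  }
  return 0;
}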

View File

@ -80,7 +80,6 @@ class CartesianCommunicator {
void * ShmCommBuf;
std::vector<void *> ShmCommBufs;
std::vector<void *> ShmStencilBufs;
int WorldRank;
int WorldSize;
@ -105,6 +104,10 @@ class CartesianCommunicator {
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
// Helper function for SHM Windows in MPI3
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
/////////////////////////////////
// Grid information queries
/////////////////////////////////
@ -173,6 +176,16 @@ class CartesianCommunicator {
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(waitall);
}
////////////////////////////////////////////////////////////
// Barrier

View File

@ -67,6 +67,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(Size==_Nprocessors);
}
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);

View File

@ -197,10 +197,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout<< "Ranks per node "<< ShmSize << std::endl;
std::cout<< "Nodes "<< GroupSize << std::endl;
std::cout<< "Ranks "<< WorldSize << std::endl;
std::cout<< "Shm CommBuf "<< ShmCommBuf << std::endl;
std::cout<<GridLogMessage<< "MPI-3 configuration: Ranks per node "<< ShmSize ;
std::cout<< " Nodes "<< GroupSize;
std::cout<< " Ranks "<< WorldSize;
std::cout<< " Shm CommBuf address"<< std::hex <<ShmCommBuf << std::dec<<std::endl;
// Done
ShmSetup=1;
@ -208,12 +208,10 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
}
ShmCommBufs.resize(ShmSize);
ShmStencilBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
ShmStencilBufs[r] = (void *) ((uint64_t)ShmCommBufs[r]+MAX_MPI_SHM_BYTES/4);
}
////////////////////////////////////////////////////////////////
@ -240,6 +238,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
ShmCoor.resize(_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);
for(int l2=0;l2<log2size;l2++){
while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
@ -347,6 +346,21 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
}
}
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@ -355,13 +369,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from,
int bytes)
{
#undef SHM_USE_BCOPY
MPI_Request xrq;
MPI_Request rrq;
static int sequence;
int rank = _processor;
int ierr;
int tag;
int check;
@ -370,6 +382,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
sequence++;
@ -379,30 +392,23 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int small = (bytes<MAX_MPI_SHM_BYTES);
#ifndef SHM_USE_BCOPY
typedef vRealD T;
int words = bytes/sizeof(T);
assert(((size_t)bytes &(sizeof(T)-1))==0);
// assert(((size_t)xmit &(sizeof(T)-1))==0);
// assert(((size_t)recv &(sizeof(T)-1))==0);
#endif
assert(((size_t)bytes &(sizeof(T)-1))==0);
assert(gme == ShmRank);
// std::cerr << "proc dest from gme gdest "<<_processor<<" "<<dest <<" "<< from <<" "<<gme<<" "<< gdest<<std::endl; Barrier();
if ( small && (dest !=MPI_UNDEFINED) ) {
if ( small && (gdest !=MPI_UNDEFINED) ) {
assert(gme != gdest);
#ifdef SHM_USE_BCOPY
bcopy(xmit,to_ptr,bytes);
#else
T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
vstream(op[w],ip[w]);
}
#endif
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
} else {
@ -411,24 +417,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
list.push_back(xrq);
}
// std::cout << "Syncing "<<std::endl; Barrier();
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
// std::cout << "Receiving "<<std::endl; Barrier();
if (small && (from !=MPI_UNDEFINED) ) {
#ifdef SHM_USE_BCOPY
bcopy(from_ptr,recv,bytes);
#else
if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr;
T *op = (T *)recv;
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
vstream(op[w],ip[w]);
}
#endif
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
@ -439,27 +438,51 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
list.push_back(rrq);
}
// std::cout << "Syncing"<<std::endl; Barrier();
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int ierr;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
assert(gme == ShmRank);
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
#if 0
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
#endif
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
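
A compact way to read the new StencilSendToRecvFromBegin above: MPI traffic is posted only for peers outside this rank's shared-memory group, because intra-node faces were already written into the peer's shared window during the gather, and the MPI_Win_sync / MPI_Barrier / MPI_Win_sync sequence then publishes those writes. The sketch below models only the group test, with mock stand-ins (group_ranks, UNDEFINED, post_isend, post_irecv are not MPI or Grid calls).

// Mock of the peer-group test in StencilSendToRecvFromBegin (no real MPI).
#include <iostream>
#include <vector>

constexpr int UNDEFINED = -1;   // plays the role of MPI_UNDEFINED

void post_isend(int dest) { std::cout << "Isend to rank " << dest << "\n"; }
void post_irecv(int from) { std::cout << "Irecv from rank " << from << "\n"; }

int main() {
  // group_ranks[r] == UNDEFINED means rank r is on another node.
  std::vector<int> group_ranks = {0, 1, UNDEFINED, UNDEFINED};

  int dest = 2, from = 1;   // example peers for one stencil direction

  if (group_ranks[dest] == UNDEFINED) post_isend(dest);   // off-node: real send
  // else: data is already in dest's shared window, nothing to send

  if (group_ranks[from] == UNDEFINED) post_irecv(from);   // off-node: real receive
  // else: the sender wrote into our shared window directly

  // In the real code an MPI_Win_sync / MPI_Barrier / MPI_Win_sync sequence
  // follows here to make the shared-memory writes visible to all node ranks.
  return 0;
}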

View File

@ -33,6 +33,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
}
int Rank(void ){ return 0; };
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{

View File

@ -50,6 +50,14 @@ typedef struct HandShake_t {
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void *CartesianCommunicator::ShmBufferSelf(void)
{
return NULL;
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
return NULL;
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());

View File

@ -33,8 +33,7 @@ directory
#define GRID_QCD_FERMION_OPERATOR_IMPL_H
namespace Grid {
namespace QCD {
namespace QCD {
//////////////////////////////////////////////
@ -108,13 +107,14 @@ namespace Grid {
INHERIT_GIMPL_TYPES(Base) \
INHERIT_FIMPL_TYPES(Base)
///////
/////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index
///////
/////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
class WilsonImpl
: public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@ -124,7 +124,6 @@ namespace Grid {
const bool LsVectorised=false;
typedef _Coeff_t Coeff_t;
INHERIT_GIMPL_TYPES(Gimpl);
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
@ -158,8 +157,7 @@ namespace Grid {
}
template <class ref>
inline void loadLinkElement(Simd &reg,
ref &memory) {
inline void loadLinkElement(Simd &reg, ref &memory) {
reg = memory;
}
@ -202,11 +200,12 @@ namespace Grid {
}
};
///////
////////////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index, 5d redblack
///////
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
////////////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public:
static const int Dimension = Nrepresentation;
@ -227,12 +226,9 @@ namespace Grid {
typedef Lattice<SiteSpinor> FermionField;
// Make the doubled gauge field a *scalar*
typedef iImplDoubledGaugeField<typename Simd::scalar_type>
SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type>
SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type>
SiteScalarGaugeLink; // scalar
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
@ -250,6 +246,7 @@ namespace Grid {
inline void loadLinkElement(Simd &reg, ref &memory) {
vsplat(reg, memory);
}
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
@ -262,8 +259,8 @@ namespace Grid {
mult(&phi(), &UU(), &chi());
}
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,
const GaugeField &Umu) {
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
{
SiteScalarGaugeField ScalarUmu;
SiteDoubledGaugeField ScalarUds;
@ -289,25 +286,25 @@ namespace Grid {
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
FermionField &A, int mu) {
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
{
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
FermionField &Atilde, int mu) {
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu)
{
assert(0);
}
};
};
////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*?
////////////////////////////////////////////////////////////////////////////////////////
template <class S, int Nrepresentation,class _Coeff_t = RealD>
class GparityWilsonImpl
: public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
template <class S, int Nrepresentation,class _Coeff_t = RealD>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
public:
static const int Dimension = Nrepresentation;
const bool LsVectorised=false;
@ -317,15 +314,9 @@ namespace Grid {
INHERIT_GIMPL_TYPES(Gimpl);
template <typename vtype>
using iImplSpinor =
iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype>
using iImplHalfSpinor =
iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
template <typename vtype>
using iImplDoubledGaugeField =
iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
@ -341,7 +332,6 @@ namespace Grid {
ImplParams Params;
GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
@ -351,6 +341,7 @@ namespace Grid {
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;
@ -419,7 +410,6 @@ namespace Grid {
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);
conformable(Umu._grid,GaugeGrid);
@ -429,7 +419,6 @@ namespace Grid {
Lattice<iScalar<vInteger> > coor(GaugeGrid);
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
@ -443,8 +432,7 @@ namespace Grid {
Uconj = where(coor==neglink,-Uconj,Uconj);
}
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss]();
Uds[ss](1)(mu) = Uconj[ss]();
@ -458,7 +446,7 @@ namespace Grid {
Utmp = where(coor==0,Uconj,Utmp);
}
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu+4) = Utmp[ss]();
}
@ -468,7 +456,7 @@ namespace Grid {
Utmp = where(coor==0,U,Utmp);
}
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](1)(mu+4) = Utmp[ss]();
}
@ -477,13 +465,13 @@ namespace Grid {
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,
FermionField &A, int mu) {
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
// DhopDir provides U or Uconj depending on coor/flavour.
GaugeLinkField link(mat._grid);
// use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for (auto ss = tmp.begin(); ss < tmp.end(); ss++) {
link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1));
}
@ -491,13 +479,13 @@ namespace Grid {
return;
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,
FermionField &Atilde, int mu) {
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde._grid->_fdimensions[0];
GaugeLinkField tmp(mat._grid);
tmp = zero;
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for (int ss = 0; ss < tmp._grid->oSites(); ss++) {
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
@ -508,13 +496,13 @@ namespace Grid {
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
};
};
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
@ -535,9 +523,10 @@ namespace Grid {
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
typedef GparityWilsonImpl<vComplex, Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
}
}
}}
#endif

View File

@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
////////////////////////
PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu,
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
gamma);
}
@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out,
Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
dirdisp, gamma);
}
};
@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
out);
}
} else {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in,
Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
out);
}
}

View File

@ -185,18 +185,14 @@ void WilsonFermion5D<Impl>::Report(void)
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime
<< " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : "
<< DhopComputeTime << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
}
@ -210,12 +206,9 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
}
if (DerivCalls > 0 || DhopCalls > 0){
@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
}
}
};
@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
assert(sF < B._grid->oSites());
assert(sU < U._grid->oSites());
Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu,
gamma);
Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
////////////////////////////
// spin trace outer product
@ -342,7 +334,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDeriv( GaugeField &mat,
void WilsonFermion5D<Impl>::DhopDeriv(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
@ -412,26 +404,24 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
out);
Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
}
#ifdef AVX512
} else if (stat.is_init() ) {
int nthreads;
stat.start();
#pragma omp parallel
#pragma omp parallel
{
#pragma omp master
#pragma omp master
nthreads = omp_get_num_threads();
int mythread = omp_get_thread_num();
stat.enter(mythread);
#pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++)
{
#pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++) {
int sU=ss;
int sF=LLs*sU;
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
stat.exit(mythread);
}
@ -442,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU = ss;
int sF = LLs * sU;
Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
out);
Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
}
DhopComputeTime+=usecond();

View File

@ -34,8 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Stat.h>
namespace Grid {
namespace QCD {
namespace QCD {
////////////////////////////////////////////////////////////////////////////////
// This is the 4d red black case appropriate to support
@ -182,7 +181,7 @@ namespace Grid {
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
};
}
}
}}
#endif

View File

@ -43,9 +43,8 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
////////////////////////////////////////////
template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf, int sF,
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@ -220,9 +219,8 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(
// Need controls to do interior, exterior, or both
template <class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf, int sF,
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite(
};
template <class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(
StencilImpl &st, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf, int sF,
void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteSpinor result;

View File

@ -32,40 +32,34 @@ directory
#define GRID_QCD_DHOP_H
namespace Grid {
namespace QCD {
namespace QCD {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
class WilsonKernelsStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOpt; // these are a temporary hack
static int HandOpt; // these are a temporary hack
};
};
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
public:
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
DiracOptDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
#ifdef AVX512
if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns,
in, out);
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else {
#else
{
@ -73,11 +67,9 @@ namespace Grid {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU,
in, out);
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
in, out);
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
@ -87,15 +79,12 @@ namespace Grid {
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
DiracOptDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
out);
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
sF++;
}
sU++;
@ -103,17 +92,12 @@ namespace Grid {
}
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
void>::type
DiracOptDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
#ifdef AVX512
if (AsmOpt) {
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
Ns, in, out);
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else {
#else
{
@ -121,11 +105,9 @@ namespace Grid {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if (HandOpt)
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
in, out);
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
sU, in, out);
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
@ -134,73 +116,48 @@ namespace Grid {
}
template <bool EnableBool = true>
typename std::enable_if<
(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
void>::type
DiracOptDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out) {
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
in, out);
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
void DiracOptDhopDir(
StencilImpl &st, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp,
int gamma);
void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private:
private:
// Specialised variants
void DiracOptGenericDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptAsmDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void DiracOptAsmDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,
FermionField &out);
void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptHandDhopSite(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(
StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
public:
public:
WilsonKernels(const ImplParams &p = ImplParams());
};
}
}
};
}}
#endif

View File

@ -33,31 +33,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl> void
WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
{
assert(0);
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
}
template<class Impl> void
WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
{
assert(0);
}
}
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
@ -65,7 +61,7 @@ namespace Grid {
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
@ -84,16 +80,14 @@ namespace Grid {
#define FX(A) WILSONASM_ ##A
#undef KERNEL_DAG
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
@ -109,31 +103,26 @@ namespace Grid {
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
#undef KERNEL_DAG
template<>
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<>
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif
#define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
commVector<SiteHalfSpinor> &buf,\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\
commVector<SiteHalfSpinor> &buf,\
\
template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD);
INSTANTIATE_ASM(ZWilsonImplF);
@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF);
INSTANTIATE_ASM(DomainWallVec5dImplD);
INSTANTIATE_ASM(ZDomainWallVec5dImplF);
INSTANTIATE_ASM(ZDomainWallVec5dImplD);
}
}
}}

View File

@ -311,9 +311,8 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<class Impl> void
WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
@ -554,9 +553,8 @@ namespace QCD {
}
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<class Impl>
void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// std::cout << "Hand op Dhop "<<std::endl;
@ -798,37 +796,34 @@ namespace QCD {
}
}
////////////////////////////////////////////////
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
template<>
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
assert(0);
}
template<>
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
assert(0);
}
template<>
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
assert(0);
}
template<>
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
commVector<SiteHalfSpinor> &buf,
template<> void
WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
assert(0);
@ -840,11 +835,9 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
// Need Nc=3 though //
#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
commVector<SiteHalfSpinor> &buf,\
int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
commVector<SiteHalfSpinor> &buf,\
template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out);
INSTANTIATE_THEM(WilsonImplF);

View File

@ -116,7 +116,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local)
Check._odata[i] = Foo._odata[SE->_offset];
else
Check._odata[i] = myStencil.comm_buf[SE->_offset];
Check._odata[i] = myStencil.CommBuf()[SE->_offset];
}
Real nrmC = norm2(Check);
@ -207,7 +207,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local)
OCheck._odata[i] = EFoo._odata[SE->_offset];
else
OCheck._odata[i] = EStencil.comm_buf[SE->_offset];
OCheck._odata[i] = EStencil.CommBuf()[SE->_offset];
}
for(int i=0;i<ECheck._grid->oSites();i++){
int permute_type;
@ -220,7 +220,7 @@ int main (int argc, char ** argv)
else if (SE->_is_local)
ECheck._odata[i] = OFoo._odata[SE->_offset];
else
ECheck._odata[i] = OStencil.comm_buf[SE->_offset];
ECheck._odata[i] = OStencil.CommBuf()[SE->_offset];
}
setCheckerboard(Check,ECheck);