mirror of
https://github.com/paboyle/Grid.git
synced 2025-11-05 06:19:31 +00:00
Compare commits
14 Commits
9203126aa5
...
7aa06329d0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7aa06329d0 | ||
|
|
9d6a38c44c | ||
|
|
6ec5cee368 | ||
|
|
f2e9a68825 | ||
|
|
d88750e6b6 | ||
|
|
821358eda7 | ||
|
|
fce6e1f135 | ||
|
|
8f0bb3e676 | ||
|
|
262c70d967 | ||
|
|
da43ef7c2d | ||
|
|
7b60ab5df1 | ||
|
|
f6b961a64e | ||
|
|
f1ed988aa3 | ||
|
|
eea51bb604 |
@@ -51,11 +51,13 @@ directory
|
||||
#pragma nv_diag_suppress cast_to_qualified_type
|
||||
//disables nvcc specific warning in many files
|
||||
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
|
||||
#pragma nv_diag_suppress declared_but_not_referenced
|
||||
#pragma nv_diag_suppress extra_semicolon
|
||||
#else
|
||||
//disables nvcc specific warning in json.hpp
|
||||
#pragma diag_suppress unsigned_compare_with_zero
|
||||
#pragma diag_suppress cast_to_qualified_type
|
||||
#pragma diag_suppress declared_but_not_referenced
|
||||
//disables nvcc specific warning in many files
|
||||
#pragma diag_suppress esa_on_defaulted_function_ignored
|
||||
#pragma diag_suppress extra_semicolon
|
||||
|
||||
@@ -183,6 +183,7 @@ public:
|
||||
int recv_from_rank,
|
||||
int bytes);
|
||||
|
||||
int IsOffNode(int rank);
|
||||
double StencilSendToRecvFrom(void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
void *recv,
|
||||
@@ -201,9 +202,9 @@ public:
|
||||
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
||||
|
||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
void *xmit,void *xmit_comp,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
void *recv,
|
||||
void *recv,void *recv_comp,
|
||||
int recv_from_rank,int do_recv,
|
||||
int xbytes,int rbytes,int dir);
|
||||
|
||||
|
||||
@@ -395,11 +395,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
|
||||
StencilSendToRecvFromComplete(list,dir);
|
||||
return offbytes;
|
||||
}
|
||||
|
||||
int CartesianCommunicator::IsOffNode(int rank)
|
||||
{
|
||||
int grank = ShmRanks[rank];
|
||||
if ( grank == MPI_UNDEFINED ) return true;
|
||||
else return false;
|
||||
}
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
@@ -414,9 +419,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
||||
return 0.0; // Do nothing -- no preparation required
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
void *xmit,void *xmit_comp,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
void *recv,void *recv_comp,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
@@ -440,7 +445,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
|
||||
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=rbytes;
|
||||
@@ -449,6 +455,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
else {
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
}
|
||||
#endif
|
||||
@@ -457,7 +464,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=xbytes;
|
||||
@@ -676,9 +683,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
void *xmit,void *xmit_comp,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
void *recv,void *recv_comp,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
|
||||
@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
|
||||
dest=0;
|
||||
}
|
||||
|
||||
int CartesianCommunicator::IsOffNode(int rank) { return false; }
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
void *recv,
|
||||
|
||||
@@ -543,37 +543,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
#if 0
|
||||
HostCommBuf= acceleratorAllocHost(bytes);
|
||||
#else
|
||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||
#if 0
|
||||
#warning "Moving host buffers to specific NUMA domain"
|
||||
int numa;
|
||||
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
||||
if(numa_name) {
|
||||
unsigned long page_size = sysconf(_SC_PAGESIZE);
|
||||
numa = atoi(numa_name);
|
||||
unsigned long page_count = bytes/page_size;
|
||||
std::vector<void *> pages(page_count);
|
||||
std::vector<int> nodes(page_count,numa);
|
||||
std::vector<int> status(page_count,-1);
|
||||
for(unsigned long p=0;p<page_count;p++){
|
||||
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
|
||||
}
|
||||
int ret = move_pages(0,
|
||||
page_count,
|
||||
&pages[0],
|
||||
&nodes[0],
|
||||
&status[0],
|
||||
MPOL_MF_MOVE);
|
||||
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
|
||||
if (ret) perror(" move_pages failed for reason:");
|
||||
}
|
||||
#endif
|
||||
acceleratorPin(HostCommBuf,bytes);
|
||||
#endif
|
||||
|
||||
// acceleratorPin(HostCommBuf,bytes);
|
||||
#endif
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
@@ -1039,11 +1010,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
|
||||
{
|
||||
int gpeer = ShmRanks[rank];
|
||||
assert(gpeer!=ShmRank); // never send to self
|
||||
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
|
||||
if (gpeer == MPI_UNDEFINED){
|
||||
return NULL;
|
||||
} else {
|
||||
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
|
||||
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
|
||||
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
|
||||
return (void *) remote;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -154,6 +154,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
||||
@@ -179,6 +179,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
||||
@@ -146,6 +146,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
||||
@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Wilson compressor will need FaceGather policies for:
|
||||
// Periodic, Dirichlet, and partial Dirichlet for DWF
|
||||
///////////////////////////////////////////////////////////////
|
||||
const int dwf_compressor_depth=2;
|
||||
#define DWF_COMPRESS
|
||||
class FaceGatherPartialDWF
|
||||
{
|
||||
public:
|
||||
#ifdef DWF_COMPRESS
|
||||
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
|
||||
#else
|
||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
||||
#endif
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
int off,int so,int partial)
|
||||
{
|
||||
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
|
||||
// Shrinks local and remote comms buffers
|
||||
GridBase *Grid = rhs.Grid();
|
||||
int Ls = Grid->_rdimensions[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth=Ls/2;
|
||||
#endif
|
||||
std::pair<int,int> *table_v = & table[0];
|
||||
auto rhs_v = rhs.View(AcceleratorRead);
|
||||
int vol=table.size()/Ls;
|
||||
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
|
||||
Integer i=idx/Ls;
|
||||
Integer s=idx%Ls;
|
||||
Integer sc=depth+s-(Ls-depth);
|
||||
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
|
||||
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
|
||||
});
|
||||
rhs_v.ViewClose();
|
||||
}
|
||||
template<class decompressor,class Decompression>
|
||||
static void DecompressFace(decompressor decompress,Decompression &dd)
|
||||
{
|
||||
auto Ls = dd.dims[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth=Ls/2;
|
||||
#endif
|
||||
// Just pass in the Grid
|
||||
auto kp = dd.kernel_p;
|
||||
auto mp = dd.mpi_p;
|
||||
int size= dd.buffer_size;
|
||||
int vol= size/Ls;
|
||||
accelerator_forNB(o,size,1,{
|
||||
int idx=o/Ls;
|
||||
int s=o%Ls;
|
||||
if ( s < depth ) {
|
||||
int oo=s*vol+idx;
|
||||
kp[o]=mp[oo];
|
||||
} else if ( s >= Ls-depth ) {
|
||||
int sc = depth + s - (Ls-depth);
|
||||
int oo=sc*vol+idx;
|
||||
kp[o]=mp[oo];
|
||||
} else {
|
||||
kp[o] = Zero();//fill rest with zero if partial dirichlet
|
||||
}
|
||||
});
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Need to gather *interior portions* for ALL s-slices in simd directions
|
||||
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
|
||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
GridBase *Grid = rhs.Grid();
|
||||
int Ls = Grid->_rdimensions[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth = Ls/2;
|
||||
#endif
|
||||
|
||||
// insertion of zeroes...
|
||||
assert( (table.size()&0x1)==0);
|
||||
int num=table.size()/2;
|
||||
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
auto rhs_v = rhs.View(AcceleratorRead);
|
||||
auto p0=&pointers[0][0];
|
||||
auto p1=&pointers[1][0];
|
||||
auto tp=&table[0];
|
||||
int nnum=num/Ls;
|
||||
accelerator_forNB(j, num, vobj::Nsimd(), {
|
||||
// Reorders both local and remote comms buffers
|
||||
//
|
||||
int s = j % Ls;
|
||||
int sp1 = (s+depth)%Ls; // peri incremented s slice
|
||||
|
||||
int hxyz= j/Ls;
|
||||
|
||||
int xyz0= hxyz*2; // xyzt part of coor
|
||||
int xyz1= hxyz*2+1;
|
||||
|
||||
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
|
||||
|
||||
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
|
||||
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
|
||||
compress.CompressExchange(p0[jj],p1[jj],
|
||||
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
|
||||
rhs_v[so+tp[kk1 ].second],
|
||||
type);
|
||||
});
|
||||
rhs_v.ViewClose();
|
||||
}
|
||||
// Merge routine is for SIMD faces
|
||||
template<class decompressor,class Merger>
|
||||
static void MergeFace(decompressor decompress,Merger &mm)
|
||||
{
|
||||
auto Ls = mm.dims[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth = Ls/2;
|
||||
#endif
|
||||
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
|
||||
auto mp = &mm.mpointer[0];
|
||||
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
|
||||
auto vp1= &mm.vpointers[1][0];
|
||||
auto type= mm.type;
|
||||
int nnum = num/Ls;
|
||||
accelerator_forNB(o,num,Merger::Nsimd,{
|
||||
|
||||
int s=o%Ls;
|
||||
int hxyz=o/Ls; // xyzt related component
|
||||
int xyz0=hxyz*2;
|
||||
int xyz1=hxyz*2+1;
|
||||
|
||||
int sp = (s+depth)%Ls;
|
||||
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
|
||||
|
||||
int oo0= s+xyz0*Ls;
|
||||
int oo1= s+xyz1*Ls;
|
||||
|
||||
// same ss0, ss1 pair goes to new layout
|
||||
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
|
||||
});
|
||||
}
|
||||
};
|
||||
class FaceGatherDWFMixedBCs
|
||||
{
|
||||
public:
|
||||
#ifdef DWF_COMPRESS
|
||||
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
|
||||
#else
|
||||
static int PartialCompressionFactor(GridBase *grid) {return 1;}
|
||||
#endif
|
||||
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
int off,int so,int partial)
|
||||
{
|
||||
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
|
||||
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||
}
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
|
||||
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
|
||||
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
|
||||
}
|
||||
template<class decompressor,class Merger>
|
||||
static void MergeFace(decompressor decompress,Merger &mm)
|
||||
{
|
||||
int partial = mm.partial;
|
||||
// std::cout << " merge DWF partial "<<partial <<std::endl;
|
||||
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
|
||||
else FaceGatherSimple::MergeFace(decompress,mm);
|
||||
}
|
||||
|
||||
template<class decompressor,class Decompression>
|
||||
static void DecompressFace(decompressor decompress,Decompression &dd)
|
||||
{
|
||||
int partial = dd.partial;
|
||||
// std::cout << " decompress DWF partial "<<partial <<std::endl;
|
||||
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
|
||||
else FaceGatherSimple::DecompressFace(decompress,dd);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// optimised versions supporting half precision too??? Deprecate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -242,8 +39,7 @@ public:
|
||||
|
||||
//Could make FaceGather a template param, but then behaviour is runtime not compile time
|
||||
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
|
||||
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
|
||||
// : public FaceGatherSimple
|
||||
class WilsonCompressorTemplate : public FaceGatherSimple
|
||||
{
|
||||
public:
|
||||
|
||||
|
||||
@@ -165,6 +165,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
||||
@@ -204,7 +204,14 @@ public:
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
|
||||
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
// Comms buffer
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
|
||||
@@ -30,25 +30,26 @@
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
uint64_t DslashFullCount;
|
||||
uint64_t DslashPartialCount;
|
||||
//uint64_t DslashPartialCount;
|
||||
uint64_t DslashDirichletCount;
|
||||
|
||||
void DslashResetCounts(void)
|
||||
{
|
||||
DslashFullCount=0;
|
||||
DslashPartialCount=0;
|
||||
// DslashPartialCount=0;
|
||||
DslashDirichletCount=0;
|
||||
}
|
||||
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
|
||||
{
|
||||
dirichlet = DslashDirichletCount;
|
||||
partial = DslashPartialCount;
|
||||
partial = 0;
|
||||
full = DslashFullCount;
|
||||
}
|
||||
void DslashLogFull(void) { DslashFullCount++;}
|
||||
void DslashLogPartial(void) { DslashPartialCount++;}
|
||||
//void DslashLogPartial(void) { DslashPartialCount++;}
|
||||
void DslashLogDirichlet(void){ DslashDirichletCount++;}
|
||||
|
||||
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
|
||||
|
||||
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||
int off,std::vector<std::pair<int,int> > & table)
|
||||
|
||||
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
// These can move into a params header and be given MacroMagic serialisation
|
||||
struct DefaultImplParams {
|
||||
Coordinate dirichlet; // Blocksize of dirichlet BCs
|
||||
int partialDirichlet;
|
||||
// int partialDirichlet;
|
||||
DefaultImplParams() {
|
||||
dirichlet.resize(0);
|
||||
partialDirichlet=0;
|
||||
// partialDirichlet=0;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -69,6 +69,12 @@ struct DefaultImplParams {
|
||||
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||
int off,std::vector<std::pair<int,int> > & table);
|
||||
|
||||
class StencilBuffer
|
||||
{
|
||||
public:
|
||||
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
|
||||
};
|
||||
|
||||
void DslashResetCounts(void);
|
||||
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
|
||||
void DslashLogFull(void);
|
||||
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
|
||||
///////////////////////////////////////////////////
|
||||
// If true, this is partially communicated per face
|
||||
///////////////////////////////////////////////////
|
||||
StencilVector _comms_partial_send;
|
||||
StencilVector _comms_partial_recv;
|
||||
// StencilVector _comms_partial_send;
|
||||
// StencilVector _comms_partial_recv;
|
||||
//
|
||||
StencilVector _comm_buf_size;
|
||||
StencilVector _permute_type;
|
||||
@@ -205,16 +211,16 @@ public:
|
||||
struct Packet {
|
||||
void * send_buf;
|
||||
void * recv_buf;
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||
#endif
|
||||
void * compressed_send_buf;
|
||||
void * compressed_recv_buf;
|
||||
Integer to_rank;
|
||||
Integer from_rank;
|
||||
Integer do_send;
|
||||
Integer do_recv;
|
||||
Integer xbytes;
|
||||
Integer rbytes;
|
||||
Integer xbytes_compressed;
|
||||
Integer rbytes_compressed;
|
||||
};
|
||||
struct Merge {
|
||||
static constexpr int Nsimd = vobj::Nsimd();
|
||||
@@ -223,7 +229,7 @@ public:
|
||||
std::vector<cobj *> vpointers;
|
||||
Integer buffer_size;
|
||||
Integer type;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
// Integer partial; // partial dirichlet BCs
|
||||
Coordinate dims;
|
||||
};
|
||||
struct Decompress {
|
||||
@@ -231,7 +237,7 @@ public:
|
||||
cobj * kernel_p;
|
||||
cobj * mpi_p;
|
||||
Integer buffer_size;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
// Integer partial; // partial dirichlet BCs
|
||||
Coordinate dims;
|
||||
};
|
||||
struct CopyReceiveBuffer {
|
||||
@@ -252,9 +258,45 @@ public:
|
||||
|
||||
protected:
|
||||
GridBase * _grid;
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Sloppy comms will make a second buffer upon comms
|
||||
///////////////////////////////////////////////////
|
||||
size_t device_heap_top; //
|
||||
size_t device_heap_bytes;//
|
||||
size_t device_heap_size; //
|
||||
void *DeviceBufferMalloc(size_t bytes)
|
||||
{
|
||||
void *ptr = (void *)device_heap_top;
|
||||
device_heap_top += bytes;
|
||||
device_heap_bytes+= bytes;
|
||||
if ( device_heap_bytes > device_heap_size ) {
|
||||
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
|
||||
assert (device_heap_bytes <= device_heap_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void DeviceBufferFreeAll(void)
|
||||
{
|
||||
device_heap_size = _unified_buffer_size*sizeof(cobj);
|
||||
// Resize up if necessary, never down
|
||||
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
|
||||
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
|
||||
}
|
||||
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
|
||||
device_heap_size = StencilBuffer::DeviceCommBuf.size();
|
||||
device_heap_bytes=0;
|
||||
}
|
||||
|
||||
public:
|
||||
GridBase *Grid(void) const { return _grid; }
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// Control reduced precision comms
|
||||
/////////////////////////////////////////////////////////
|
||||
int SloppyComms;
|
||||
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||
// without adding parameters. Perhaps a template parameter to StenciView is
|
||||
@@ -268,7 +310,7 @@ public:
|
||||
}
|
||||
|
||||
int face_table_computed;
|
||||
int partialDirichlet;
|
||||
// int partialDirichlet;
|
||||
int fullDirichlet;
|
||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||
deviceVector<int> surface_list;
|
||||
@@ -361,24 +403,145 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Non blocking send and receive. Necessarily parallel.
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void DecompressPacket(Packet &packet)
|
||||
{
|
||||
if ( !SloppyComms ) return;
|
||||
|
||||
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
|
||||
|
||||
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||
uint64_t words = packet.rbytes/sizeof(word);
|
||||
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||
const uint64_t outer = words/nsimd;
|
||||
|
||||
if(sizeof(word)==8) {
|
||||
|
||||
// Can either choose to represent as float vs double and prec change
|
||||
// OR
|
||||
// truncate the mantissa bfp16 style
|
||||
double *dbuf =(double *) packet.recv_buf;
|
||||
float *fbuf =(float *) packet.compressed_recv_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
|
||||
});
|
||||
|
||||
} else if ( sizeof(word)==4){
|
||||
// Can either choose to represent as half vs float and prec change
|
||||
// OR
|
||||
// truncate the mantissa bfp16 style
|
||||
|
||||
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
|
||||
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
|
||||
});
|
||||
|
||||
} else {
|
||||
assert(0 && "unknown floating point precision");
|
||||
}
|
||||
}
|
||||
}
|
||||
void CompressPacket(Packet &packet)
|
||||
{
|
||||
packet.xbytes_compressed = packet.xbytes;
|
||||
packet.compressed_send_buf = packet.send_buf;
|
||||
|
||||
packet.rbytes_compressed = packet.rbytes;
|
||||
packet.compressed_recv_buf = packet.recv_buf;
|
||||
|
||||
if ( !SloppyComms ) {
|
||||
return;
|
||||
}
|
||||
|
||||
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||
uint64_t words = packet.xbytes/sizeof(word);
|
||||
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||
const uint64_t outer = words/nsimd;
|
||||
|
||||
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
|
||||
|
||||
packet.rbytes_compressed = packet.rbytes/2;
|
||||
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
|
||||
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
|
||||
|
||||
}
|
||||
//else {
|
||||
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
|
||||
// }
|
||||
|
||||
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
|
||||
|
||||
packet.xbytes_compressed = packet.xbytes/2;
|
||||
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
|
||||
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
|
||||
|
||||
if(sizeof(word)==8) {
|
||||
|
||||
double *dbuf =(double *) packet.send_buf;
|
||||
float *fbuf =(float *) packet.compressed_send_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
|
||||
});
|
||||
|
||||
} else if ( sizeof(word)==4){
|
||||
|
||||
uint32_t *fbuf =(uint32_t *) packet.send_buf;
|
||||
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
|
||||
});
|
||||
|
||||
} else {
|
||||
assert(0 && "unknown floating point precision");
|
||||
}
|
||||
|
||||
}
|
||||
// else {
|
||||
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
|
||||
// }
|
||||
|
||||
return;
|
||||
}
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Begin "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
FlightRecorder::StepLog("Communicate begin");
|
||||
///////////////////////////////////////////////
|
||||
// All GPU kernel tasks must complete
|
||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||
// But the HaloGather had a barrier too.
|
||||
// accelerator_barrier(); All kernels should ALREADY be complete
|
||||
//Everyone is here, so noone running slow and still using receive buffer
|
||||
_grid->StencilBarrier();
|
||||
// But the HaloGather had a barrier too.
|
||||
///////////////////////////////////////////////
|
||||
if (SloppyComms) {
|
||||
DeviceBufferFreeAll();
|
||||
}
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
this->CompressPacket(Packets[i]);
|
||||
}
|
||||
if (SloppyComms) {
|
||||
accelerator_barrier();
|
||||
#ifdef NVLINK_GET
|
||||
_grid->StencilBarrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].compressed_send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].compressed_recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||
}
|
||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
@@ -389,19 +552,22 @@ public:
|
||||
// Starts intranode
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate Begin "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].send_buf,Packets[i].compressed_send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].recv_buf,Packets[i].compressed_recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||
// std::cout << "Communicate Begin started "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
}
|
||||
FlightRecorder::StepLog("Communicate begin has finished");
|
||||
// Get comms started then run checksums
|
||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if ( Packets[i].do_send )
|
||||
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
|
||||
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -416,14 +582,15 @@ public:
|
||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
// if ( this->partialDirichlet ) DslashLogPartial();
|
||||
if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||
// accelerator_barrier();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
this->DecompressPacket(Packets[i]);
|
||||
if ( Packets[i].do_recv )
|
||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
||||
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
|
||||
}
|
||||
FlightRecorder::StepLog("Finish communicate complete");
|
||||
}
|
||||
@@ -618,7 +785,7 @@ public:
|
||||
}
|
||||
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
||||
Decompress d;
|
||||
d.partial = this->partialDirichlet;
|
||||
// d.partial = this->partialDirichlet;
|
||||
d.dims = _grid->_fdimensions;
|
||||
d.kernel_p = k_p;
|
||||
d.mpi_p = m_p;
|
||||
@@ -627,7 +794,7 @@ public:
|
||||
}
|
||||
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||
Merge m;
|
||||
m.partial = this->partialDirichlet;
|
||||
// m.partial = this->partialDirichlet;
|
||||
m.dims = _grid->_fdimensions;
|
||||
m.type = type;
|
||||
m.mpointer = merge_p;
|
||||
@@ -732,8 +899,8 @@ public:
|
||||
int block = dirichlet_block[dimension];
|
||||
this->_comms_send[ii] = comm_dim;
|
||||
this->_comms_recv[ii] = comm_dim;
|
||||
this->_comms_partial_send[ii] = 0;
|
||||
this->_comms_partial_recv[ii] = 0;
|
||||
// this->_comms_partial_send[ii] = 0;
|
||||
// this->_comms_partial_recv[ii] = 0;
|
||||
if ( block && comm_dim ) {
|
||||
assert(abs(displacement) < ld );
|
||||
// Quiesce communication across block boundaries
|
||||
@@ -754,10 +921,10 @@ public:
|
||||
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
||||
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
||||
}
|
||||
if ( partialDirichlet ) {
|
||||
this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||
}
|
||||
// if ( partialDirichlet ) {
|
||||
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -769,6 +936,7 @@ public:
|
||||
Parameters p=Parameters(),
|
||||
bool preserve_shm=false)
|
||||
{
|
||||
SloppyComms = 0;
|
||||
face_table_computed=0;
|
||||
_grid = grid;
|
||||
this->parameters=p;
|
||||
@@ -786,7 +954,7 @@ public:
|
||||
this->same_node.resize(npoints);
|
||||
|
||||
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
||||
partialDirichlet = p.partialDirichlet;
|
||||
// partialDirichlet = p.partialDirichlet;
|
||||
DirichletBlock(p.dirichlet); // comms send/recv set up
|
||||
fullDirichlet=0;
|
||||
for(int d=0;d<p.dirichlet.size();d++){
|
||||
@@ -867,7 +1035,7 @@ public:
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
// Allow for multiple stencils to exist simultaneously
|
||||
// Allow for multiple stencils to be communicated simultaneously
|
||||
if (!preserve_shm)
|
||||
_grid->ShmBufferFreeAll();
|
||||
|
||||
@@ -935,7 +1103,8 @@ public:
|
||||
GridBase *grid=_grid;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@@ -1124,8 +1293,8 @@ public:
|
||||
|
||||
int comms_send = this->_comms_send[point];
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
|
||||
assert(rhs.Grid()==_grid);
|
||||
// conformable(_grid,rhs.Grid());
|
||||
@@ -1160,11 +1329,11 @@ public:
|
||||
int rbytes;
|
||||
|
||||
if ( comms_send ) xbytes = bytes; // Full send
|
||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else xbytes = 0; // full dirichlet
|
||||
|
||||
if ( comms_recv ) rbytes = bytes;
|
||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else rbytes = 0;
|
||||
|
||||
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
@@ -1191,7 +1360,8 @@ public:
|
||||
}
|
||||
|
||||
|
||||
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||
if ( compress.DecompressionStep()&&comms_recv) {
|
||||
recv_buf=u_simd_recv_buf[0];
|
||||
} else {
|
||||
recv_buf=this->u_recv_buf_p;
|
||||
@@ -1225,7 +1395,8 @@ public:
|
||||
#endif
|
||||
|
||||
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
|
||||
|
||||
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
||||
if ( !duplicate ) { // Force comms for now
|
||||
@@ -1234,8 +1405,8 @@ public:
|
||||
// Build a list of things to do after we synchronise GPUs
|
||||
// Start comms now???
|
||||
///////////////////////////////////////////////////////////
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
||||
int do_send = (comms_send) && (!shm_send );
|
||||
int do_recv = (comms_send) && (!shm_recv );
|
||||
AddPacket((void *)&send_buf[comm_off],
|
||||
(void *)&recv_buf[comm_off],
|
||||
xmit_to_rank, do_send,
|
||||
@@ -1243,7 +1414,7 @@ public:
|
||||
xbytes,rbytes);
|
||||
}
|
||||
|
||||
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
|
||||
if ( (compress.DecompressionStep() && comms_recv) ) {
|
||||
AddDecompress(&this->u_recv_buf_p[comm_off],
|
||||
&recv_buf[comm_off],
|
||||
words,Decompressions);
|
||||
@@ -1265,8 +1436,8 @@ public:
|
||||
|
||||
int comms_send = this->_comms_send[point];
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@@ -1341,18 +1512,20 @@ public:
|
||||
|
||||
|
||||
if ( comms_send ) xbytes = bytes;
|
||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else xbytes = 0;
|
||||
|
||||
if ( comms_recv ) rbytes = bytes;
|
||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else rbytes = 0;
|
||||
|
||||
// Gathers SIMD lanes for send and merge
|
||||
// Different faces can be full comms or partial comms with multiple ranks per node
|
||||
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||
if ( comms_send || comms_recv ) {
|
||||
|
||||
int partial = partialDirichlet;
|
||||
// int partial = partialDirichlet;
|
||||
int partial = 0;
|
||||
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
||||
spointers,dimension,sx,cbmask,
|
||||
compress,permute_type,partial );
|
||||
@@ -1418,7 +1591,8 @@ public:
|
||||
if ( (bytes != rbytes) && (rbytes!=0) ){
|
||||
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
||||
}
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_send = (comms_send) && (!shm_send );
|
||||
AddPacket((void *)sp,(void *)rp,
|
||||
xmit_to_rank,do_send,
|
||||
recv_from_rank,do_send,
|
||||
@@ -1432,7 +1606,8 @@ public:
|
||||
}
|
||||
}
|
||||
// rpointer may be doing a remote read in the gather over SHM
|
||||
if ( comms_recv|comms_partial_recv ) {
|
||||
// if ( comms_recv|comms_partial_recv ) {
|
||||
if ( comms_recv ) {
|
||||
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||
}
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ void acceleratorInit(void)
|
||||
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
||||
|
||||
|
||||
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
||||
GPU_PROP_FMT(totalGlobalMem,"%zu");
|
||||
GPU_PROP(managedMemory);
|
||||
GPU_PROP(isMultiGpuBoard);
|
||||
GPU_PROP(warpSize);
|
||||
|
||||
@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
|
||||
auto err = cudaMallocHost((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||
assert(0);
|
||||
}
|
||||
return ptr;
|
||||
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||
assert(0);
|
||||
}
|
||||
return ptr;
|
||||
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
|
||||
@@ -201,8 +201,7 @@ int main(int argc, char **argv) {
|
||||
|
||||
Params.dirichlet=NonDirichlet;
|
||||
ParamsDir.dirichlet=Dirichlet;
|
||||
ParamsDir.partialDirichlet=0;
|
||||
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
|
||||
// ParamsDir.partialDirichlet=0;
|
||||
|
||||
// double StoppingCondition = 1e-14;
|
||||
// double MDStoppingCondition = 1e-9;
|
||||
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
|
||||
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
|
||||
else ParamsDen.dirichlet = NonDirichlet;
|
||||
|
||||
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
|
||||
else ParamsNum.partialDirichlet = 0;
|
||||
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
|
||||
// else ParamsNum.partialDirichlet = 0;
|
||||
|
||||
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
|
||||
else ParamsDen.partialDirichlet = 0;
|
||||
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
|
||||
// else ParamsDen.partialDirichlet = 0;
|
||||
|
||||
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
|
||||
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
|
||||
|
||||
@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
|
||||
ParamsF.dirichlet=NonDirichlet;
|
||||
ParamsDir.dirichlet=Dirichlet;
|
||||
ParamsDirF.dirichlet=Dirichlet;
|
||||
ParamsDir.partialDirichlet=1;
|
||||
ParamsDirF.partialDirichlet=1;
|
||||
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
|
||||
// ParamsDir.partialDirichlet=1;
|
||||
// ParamsDirF.partialDirichlet=1;
|
||||
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
|
||||
|
||||
// double StoppingCondition = 1e-14;
|
||||
// double MDStoppingCondition = 1e-9;
|
||||
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
|
||||
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
|
||||
else ParamsDen.dirichlet = NonDirichlet;
|
||||
|
||||
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
|
||||
else ParamsNum.partialDirichlet = 0;
|
||||
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
|
||||
// else ParamsNum.partialDirichlet = 0;
|
||||
|
||||
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
|
||||
else ParamsDen.partialDirichlet = 0;
|
||||
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
|
||||
// else ParamsDen.partialDirichlet = 0;
|
||||
|
||||
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
|
||||
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
|
||||
|
||||
ParamsDenF.dirichlet = ParamsDen.dirichlet;
|
||||
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
|
||||
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
|
||||
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
|
||||
|
||||
ParamsNumF.dirichlet = ParamsNum.dirichlet;
|
||||
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
|
||||
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
|
||||
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
|
||||
|
||||
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
|
||||
|
||||
@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
header();
|
||||
|
||||
for(int lat=8;lat<=maxlat;lat+=4){
|
||||
for(int Ls=8;Ls<=8;Ls*=2){
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
|
||||
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int ncomm;
|
||||
|
||||
double dbytes;
|
||||
for(int i=0;i<Nloop;i++){
|
||||
double start=usecond();
|
||||
|
||||
dbytes=0;
|
||||
ncomm=0;
|
||||
|
||||
std::vector<CommsRequest_t> requests;
|
||||
|
||||
for(int mu=0;mu<4;mu++){
|
||||
|
||||
|
||||
if (mpi_layout[mu]>1 ) {
|
||||
|
||||
ncomm++;
|
||||
int comm_proc=1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu);
|
||||
|
||||
comm_proc = mpi_layout[mu]-1;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu+4);
|
||||
|
||||
}
|
||||
}
|
||||
Grid.StencilSendToRecvFromComplete(requests,0);
|
||||
Grid.Barrier();
|
||||
double stop=usecond();
|
||||
t_time[i] = stop-start; // microseconds
|
||||
|
||||
}
|
||||
|
||||
timestat.statistics(t_time);
|
||||
|
||||
dbytes=dbytes*ppn;
|
||||
double xbytes = dbytes*0.5;
|
||||
// double rbytes = dbytes*0.5;
|
||||
double bidibytes = dbytes;
|
||||
|
||||
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
|
||||
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
|
||||
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
|
||||
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
||||
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
header();
|
||||
|
||||
for(int lat=8;lat<=maxlat;lat+=4){
|
||||
for(int Ls=8;Ls<=8;Ls*=2){
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
RealD Nnode = Grid.NodeCount();
|
||||
RealD ppn = Nrank/Nnode;
|
||||
|
||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||
Grid.ShmBufferFreeAll();
|
||||
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||
for(int d=0;d<8;d++){
|
||||
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
|
||||
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int ncomm;
|
||||
double dbytes;
|
||||
for(int i=0;i<Nloop;i++){
|
||||
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
|
||||
std::vector<CommsRequest_t> requests;
|
||||
dbytes=0;
|
||||
ncomm=0;
|
||||
for(int mu=0;mu<4;mu++){
|
||||
|
||||
|
||||
for(int dir=0;dir<8;dir++) {
|
||||
|
||||
double tbytes;
|
||||
int mu =dir % 4;
|
||||
|
||||
if (mpi_layout[mu]>1 ) {
|
||||
|
||||
ncomm++;
|
||||
int comm_proc=1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu);
|
||||
Grid.StencilSendToRecvFromComplete(requests,mu);
|
||||
requests.resize(0);
|
||||
if ( dir == mu ) {
|
||||
int comm_proc=1;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
} else {
|
||||
int comm_proc = mpi_layout[mu]-1;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
}
|
||||
int tid = omp_get_thread_num();
|
||||
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
|
||||
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
|
||||
|
||||
comm_proc = mpi_layout[mu]-1;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu+4);
|
||||
Grid.StencilSendToRecvFromComplete(requests,mu+4);
|
||||
requests.resize(0);
|
||||
|
||||
dbytes+=tbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
Grid.Barrier();
|
||||
double stop=usecond();
|
||||
t_time[i] = stop-start; // microseconds
|
||||
|
||||
}
|
||||
|
||||
timestat.statistics(t_time);
|
||||
|
||||
@@ -32,18 +32,18 @@
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
template<class d>
|
||||
struct scal {
|
||||
d internal;
|
||||
////////////////////////
|
||||
/// Move to domains ////
|
||||
////////////////////////
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
int Ls=8;
|
||||
for(int i=0;i<argc;i++)
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++) {
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////
|
||||
// With comms
|
||||
//////////////////
|
||||
Coordinate Dirichlet(Nd+1,0);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
|
||||
//////////////////
|
||||
// Domain decomposed
|
||||
//////////////////
|
||||
/*
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate CommDim(Nd);
|
||||
Coordinate shm;
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
*/
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
|
||||
{
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
||||
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
#undef SINGLE
|
||||
#ifdef SINGLE
|
||||
typedef vComplexF Simd;
|
||||
typedef LatticeFermionF FermionField;
|
||||
typedef LatticeGaugeFieldF GaugeField;
|
||||
typedef LatticeColourMatrixF ColourMatrixField;
|
||||
typedef DomainWallFermionF FermionAction;
|
||||
#else
|
||||
typedef vComplexD Simd;
|
||||
typedef LatticeFermionD FermionField;
|
||||
typedef LatticeGaugeFieldD GaugeField;
|
||||
typedef LatticeColourMatrixD ColourMatrixField;
|
||||
typedef DomainWallFermionD FermionAction;
|
||||
#endif
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermion src (FGrid); random(RNG5,src);
|
||||
|
||||
FermionField src (FGrid); random(RNG5,src);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
|
||||
src = src*N2;
|
||||
#endif
|
||||
|
||||
|
||||
LatticeFermion result(FGrid); result=Zero();
|
||||
LatticeFermion ref(FGrid); ref=Zero();
|
||||
LatticeFermion tmp(FGrid);
|
||||
LatticeFermion err(FGrid);
|
||||
FermionField result(FGrid); result=Zero();
|
||||
FermionField ref(FGrid); ref=Zero();
|
||||
FermionField tmp(FGrid);
|
||||
FermionField err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
GaugeField Umu(UGrid);
|
||||
GaugeField UmuCopy(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
// SU<Nc>::ColdConfiguration(Umu);
|
||||
UmuCopy=Umu;
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
#if 0
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LatticeColourMatrix ttmp(UGrid);
|
||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
// if (mu !=2 ) ttmp = 0;
|
||||
// ttmp = ttmp* pow(10.0,mu);
|
||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
||||
#endif
|
||||
|
||||
////////////////////////////////////
|
||||
// Apply BCs
|
||||
////////////////////////////////////
|
||||
Coordinate Block(4);
|
||||
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
|
||||
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
|
||||
|
||||
DirichletFilter<GaugeField> Filter(Block);
|
||||
Filter.applyFilter(Umu);
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
LatticeGaugeField Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||
{
|
||||
autoView( Umu5d_v, Umu5d, CpuWrite);
|
||||
autoView( Umu_v , Umu , CpuRead);
|
||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<ColourMatrixField> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
|
||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
int ncall =1000;
|
||||
|
||||
FermionAction::ImplParams p;
|
||||
p.dirichlet=Dirichlet;
|
||||
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||
Dw.SloppyComms(sloppy);
|
||||
Dw.ImportGauge(Umu);
|
||||
|
||||
int ncall =300;
|
||||
RealD n2e;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
Dw.Dhop(src,result,0);
|
||||
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=single_site_flops*volume*ncall;
|
||||
|
||||
auto nsimd = vComplex::Nsimd();
|
||||
auto simdwidth = sizeof(vComplex);
|
||||
auto nsimd = Simd::Nsimd();
|
||||
auto simdwidth = sizeof(Simd);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
|
||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
|
||||
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
//exit(0);
|
||||
n2e = norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
if(( norm2(err)>1.0e-4) ) {
|
||||
/*
|
||||
std::cout << "RESULT\n " << result<<std::endl;
|
||||
std::cout << "REF \n " << ref <<std::endl;
|
||||
std::cout << "ERR \n " << err <<std::endl;
|
||||
*/
|
||||
if(( n2e>1.0e-4) ) {
|
||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
std::cout<<GridLogMessage << "RESULT" << std::endl;
|
||||
// std::cout << result<<std::endl;
|
||||
std::cout << norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "REF" << std::endl;
|
||||
std::cout << norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "ERR" << std::endl;
|
||||
std::cout << norm2(err)<<std::endl;
|
||||
FGrid->Barrier();
|
||||
exit(-1);
|
||||
}
|
||||
assert (norm2(err)< 1.0e-4 );
|
||||
assert (n2e< 1.0e-4 );
|
||||
}
|
||||
|
||||
if (1)
|
||||
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
for(int i=0;i<ref_v.size();i++){
|
||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
int i=s+Ls*ss;
|
||||
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
// dump=1;
|
||||
Dw.Dhop(src,result,1);
|
||||
|
||||
Dw.Dhop(src,result,DaggerYes);
|
||||
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
LatticeFermion src_e (FrbGrid);
|
||||
LatticeFermion src_o (FrbGrid);
|
||||
LatticeFermion r_e (FrbGrid);
|
||||
LatticeFermion r_o (FrbGrid);
|
||||
LatticeFermion r_eo (FGrid);
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
assert((n2e)<1.0e-4);
|
||||
|
||||
FermionField src_e (FrbGrid);
|
||||
FermionField src_o (FrbGrid);
|
||||
FermionField r_e (FrbGrid);
|
||||
FermionField r_o (FrbGrid);
|
||||
FermionField r_eo (FGrid);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
#ifdef CUDA_PROFILE
|
||||
if(i==10) cudaProfilerStart();
|
||||
#endif
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
#ifdef CUDA_PROFILE
|
||||
if(i==20) cudaProfilerStop();
|
||||
#endif
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
|
||||
setCheckerboard(r_eo,r_e);
|
||||
|
||||
err = r_eo-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
|
||||
assert(n2e<1.0e-4);
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
pickCheckerboard(Odd,src_o,err);
|
||||
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)
|
||||
|
||||
assert(norm2(src_e)<1.0e-4);
|
||||
assert(norm2(src_o)<1.0e-4);
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
void Benchmark(int Ls, Coordinate Dirichlet);
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet);
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
|
||||
//////////////////
|
||||
// Domain decomposed
|
||||
//////////////////
|
||||
/*
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate CommDim(Nd);
|
||||
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
|
||||
//////////////////////
|
||||
// Node level
|
||||
//////////////////////
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
// Dirichlet[0] = 0;
|
||||
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
|
||||
Benchmark(Ls,Dirichlet);
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||
// Dirichlet[0] = 0;
|
||||
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
|
||||
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
|
||||
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
|
||||
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
|
||||
|
||||
Benchmark(Ls,Dirichlet);
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
*/
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
|
||||
{
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
GridLogLayout();
|
||||
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
typedef LatticeGaugeFieldF GaugeField;
|
||||
typedef LatticeColourMatrixF ColourMatrixField;
|
||||
typedef DomainWallFermionF FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
#else
|
||||
typedef vComplexD Simd;
|
||||
typedef LatticeFermionD FermionField;
|
||||
typedef LatticeGaugeFieldD GaugeField;
|
||||
typedef LatticeColourMatrixD ColourMatrixField;
|
||||
typedef DomainWallFermionD FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE2
|
||||
typedef vComplexD2 Simd;
|
||||
typedef LatticeFermionD2 FermionField;
|
||||
typedef LatticeGaugeFieldD2 GaugeField;
|
||||
typedef LatticeColourMatrixD2 ColourMatrixField;
|
||||
typedef DomainWallFermionD2 FermionAction;
|
||||
#endif
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
FermionAction::ImplParams p;
|
||||
p.dirichlet=Dirichlet;
|
||||
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||
Dw.SloppyComms(sloppy);
|
||||
Dw.ImportGauge(Umu);
|
||||
|
||||
int ncall =300;
|
||||
|
||||
@@ -1,465 +0,0 @@
|
||||
/*************************************************************************************
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
////////////////////////
|
||||
/// Move to domains ////
|
||||
////////////////////////
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
int Ls=8;
|
||||
for(int i=0;i<argc;i++) {
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////
|
||||
// With comms
|
||||
//////////////////
|
||||
Coordinate Dirichlet(Nd+1,0);
|
||||
|
||||
for(auto partial : {0}) {
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
Benchmark(Ls,Dirichlet,partial);
|
||||
}
|
||||
|
||||
//////////////////
|
||||
// Domain decomposed
|
||||
//////////////////
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate CommDim(Nd);
|
||||
//Coordinate shm({2,1,1,1});
|
||||
Coordinate shm;
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
|
||||
|
||||
//////////////////////
|
||||
// Node level
|
||||
//////////////////////
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
|
||||
for(auto partial : {0,1}) {
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
Benchmark(Ls,Dirichlet,partial);
|
||||
}
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
|
||||
|
||||
for(auto partial : {0,1}) {
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
Benchmark(Ls,Dirichlet,partial);
|
||||
}
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
|
||||
{
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
#define SINGLE
|
||||
#ifdef SINGLE
|
||||
typedef vComplexF Simd;
|
||||
typedef LatticeFermionF FermionField;
|
||||
typedef LatticeGaugeFieldF GaugeField;
|
||||
typedef LatticeColourMatrixF ColourMatrixField;
|
||||
typedef DomainWallFermionF FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
typedef vComplexD Simd;
|
||||
typedef LatticeFermionD FermionField;
|
||||
typedef LatticeGaugeFieldD GaugeField;
|
||||
typedef LatticeColourMatrixD ColourMatrixField;
|
||||
typedef DomainWallFermionD FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE2
|
||||
typedef vComplexD2 Simd;
|
||||
typedef LatticeFermionD2 FermionField;
|
||||
typedef LatticeGaugeFieldD2 GaugeField;
|
||||
typedef LatticeColourMatrixD2 ColourMatrixField;
|
||||
typedef DomainWallFermionD2 FermionAction;
|
||||
#endif
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
|
||||
|
||||
FermionField src (FGrid); random(RNG5,src);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
||||
SpinColourVectorF tmp;
|
||||
tmp=Zero();
|
||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
||||
std::cout << " source site 0 " << tmp<<std::endl;
|
||||
pokeSite(tmp,src,origin);
|
||||
}
|
||||
#else
|
||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||
src = src*N2;
|
||||
#endif
|
||||
|
||||
FermionField result(FGrid); result=Zero();
|
||||
FermionField ref(FGrid); ref=Zero();
|
||||
FermionField tmp(FGrid);
|
||||
FermionField err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
GaugeField Umu(UGrid);
|
||||
GaugeField UmuFull(UGrid);
|
||||
GaugeField UmuCopy(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
UmuCopy=Umu;
|
||||
UmuFull=Umu;
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
|
||||
////////////////////////////////////
|
||||
// Apply BCs
|
||||
////////////////////////////////////
|
||||
Coordinate Block(4);
|
||||
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
|
||||
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
|
||||
|
||||
DirichletFilter<GaugeField> Filter(Block);
|
||||
Filter.applyFilter(Umu);
|
||||
if(!partial) Filter.applyFilter(UmuCopy);
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
std::vector<ColourMatrixField> U(4,UGrid);
|
||||
std::vector<ColourMatrixField> Ucopy(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
{
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
int depth=dwf_compressor_depth;
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
|
||||
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
FermionAction::ImplParams p;
|
||||
p.dirichlet=Dirichlet;
|
||||
p.partialDirichlet=partial;
|
||||
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||
|
||||
int ncall =1;
|
||||
RealD n2e;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
Dw.Dhop(src,result,0);
|
||||
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.Dhop(src,result,0);
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=single_site_flops*volume*ncall;
|
||||
|
||||
auto nsimd = Simd::Nsimd();
|
||||
auto simdwidth = sizeof(Simd);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
err = ref-result;
|
||||
n2e = norm2(err);
|
||||
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
if(( n2e>1.0e-4) ) {
|
||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
|
||||
DumpSliceNorm("s-slice ref ",ref,1);
|
||||
DumpSliceNorm("s-slice res ",result,1);
|
||||
DumpSliceNorm("s-slice error ",err,1);
|
||||
exit(-1);
|
||||
}
|
||||
assert (n2e< 1.0e-4 );
|
||||
}
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
int depth=dwf_compressor_depth;
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
|
||||
Dw.Dhop(src,result,DaggerYes);
|
||||
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
||||
err = ref-result;
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
assert((n2e)<1.0e-4);
|
||||
|
||||
FermionField src_e (FrbGrid);
|
||||
FermionField src_o (FrbGrid);
|
||||
FermionField r_e (FrbGrid);
|
||||
FermionField r_o (FrbGrid);
|
||||
FermionField r_eo (FGrid);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
pickCheckerboard(Odd,src_o,src);
|
||||
|
||||
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
|
||||
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
{
|
||||
FGrid->Barrier();
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=(single_site_flops*volume*ncall)/2.0;
|
||||
|
||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
|
||||
}
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||
Dw.Dhop (src ,result,DaggerNo);
|
||||
|
||||
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
|
||||
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
|
||||
|
||||
setCheckerboard(r_eo,r_o);
|
||||
setCheckerboard(r_eo,r_e);
|
||||
|
||||
err = r_eo-result;
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
assert(n2e<1.0e-4);
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
pickCheckerboard(Odd,src_o,err);
|
||||
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
|
||||
|
||||
assert(norm2(src_e)<1.0e-4);
|
||||
assert(norm2(src_o)<1.0e-4);
|
||||
}
|
||||
@@ -7,3 +7,4 @@ module load MPFR
|
||||
module load NVHPC
|
||||
module load UCX
|
||||
module load OpenMPI
|
||||
ulimit -c 0
|
||||
|
||||
Reference in New Issue
Block a user