mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
commit
b1c86900b2
@ -257,13 +257,11 @@ public:
|
|||||||
virtual RealD Mpc (const Field &in, Field &out) {
|
virtual RealD Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
tmp.Checkerboard() = !in.Checkerboard();
|
tmp.Checkerboard() = !in.Checkerboard();
|
||||||
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
|
|
||||||
|
|
||||||
_Mat.Meooe(in,tmp);
|
_Mat.Meooe(in,tmp);
|
||||||
_Mat.MooeeInv(tmp,out);
|
_Mat.MooeeInv(tmp,out);
|
||||||
_Mat.Meooe(out,tmp);
|
_Mat.Meooe(out,tmp);
|
||||||
|
|
||||||
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
|
|
||||||
_Mat.Mooee(in,out);
|
_Mat.Mooee(in,out);
|
||||||
return axpy_norm(out,-1.0,tmp,out);
|
return axpy_norm(out,-1.0,tmp,out);
|
||||||
}
|
}
|
||||||
@ -366,6 +364,9 @@ public:
|
|||||||
void OpDir(const Field& in, Field& out, int dir, int disp) {
|
void OpDir(const Field& in, Field& out, int dir, int disp) {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
void OpDirAll(const Field& in, std::vector<Field>& out){
|
||||||
|
assert(0);
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Matrix, class Field>
|
template<class Matrix, class Field>
|
||||||
|
@ -234,10 +234,8 @@ public:
|
|||||||
|
|
||||||
GridBase *grid=in.Grid();
|
GridBase *grid=in.Grid();
|
||||||
|
|
||||||
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
|
|
||||||
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
|
|
||||||
|
|
||||||
int vol=grid->gSites();
|
int vol=grid->gSites();
|
||||||
|
typedef typename Field::vector_type vector_type;
|
||||||
|
|
||||||
Field T0(grid); T0 = in;
|
Field T0(grid); T0 = in;
|
||||||
Field T1(grid);
|
Field T1(grid);
|
||||||
@ -260,12 +258,26 @@ public:
|
|||||||
for(int n=2;n<order;n++){
|
for(int n=2;n<order;n++){
|
||||||
|
|
||||||
Linop.HermOp(*Tn,y);
|
Linop.HermOp(*Tn,y);
|
||||||
// y=xscale*y+mscale*(*Tn);
|
#if 0
|
||||||
// *Tnp=2.0*y-(*Tnm);
|
auto y_v = y.View();
|
||||||
// out=out+Coeffs[n]* (*Tnp);
|
auto Tn_v = Tn->View();
|
||||||
|
auto Tnp_v = Tnp->View();
|
||||||
|
auto Tnm_v = Tnm->View();
|
||||||
|
constexpr int Nsimd = vector_type::Nsimd();
|
||||||
|
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
|
||||||
|
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||||
|
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
|
||||||
|
});
|
||||||
|
if ( Coeffs[n] != 0.0) {
|
||||||
|
axpy(out,Coeffs[n],*Tnp,out);
|
||||||
|
}
|
||||||
|
#else
|
||||||
axpby(y,xscale,mscale,y,(*Tn));
|
axpby(y,xscale,mscale,y,(*Tn));
|
||||||
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
|
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
|
||||||
|
if ( Coeffs[n] != 0.0) {
|
||||||
axpy(out,Coeffs[n],*Tnp,out);
|
axpy(out,Coeffs[n],*Tnp,out);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// Cycle pointers to avoid copies
|
// Cycle pointers to avoid copies
|
||||||
Field *swizzle = Tnm;
|
Field *swizzle = Tnm;
|
||||||
Tnm =Tn;
|
Tnm =Tn;
|
||||||
|
@ -6,21 +6,39 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||||
bool MemoryProfiler::debug = false;
|
bool MemoryProfiler::debug = false;
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
|
||||||
#define SMALL_LIMIT (0)
|
#ifdef GRID_CUDA
|
||||||
|
int PointerCache::Ncache = 32;
|
||||||
#else
|
#else
|
||||||
#define SMALL_LIMIT (4096)
|
int PointerCache::Ncache = 8;
|
||||||
#endif
|
#endif
|
||||||
|
int PointerCache::Victim;
|
||||||
|
int PointerCache::VictimSmall;
|
||||||
|
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
|
||||||
|
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
|
||||||
|
|
||||||
#ifdef POINTER_CACHE
|
void PointerCache::Init(void)
|
||||||
int PointerCache::victim;
|
{
|
||||||
|
char * str;
|
||||||
|
|
||||||
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
|
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
||||||
|
if ( str ) Ncache = atoi(str);
|
||||||
|
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
|
||||||
|
|
||||||
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||||
|
if ( str ) NcacheSmall = atoi(str);
|
||||||
if (bytes < SMALL_LIMIT ) return ptr;
|
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
|
||||||
|
|
||||||
|
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
|
||||||
|
}
|
||||||
|
void *PointerCache::Insert(void *ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
if (bytes < GRID_ALLOC_SMALL_LIMIT )
|
||||||
|
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
|
||||||
|
return Insert(ptr,bytes,Entries,Ncache,Victim);
|
||||||
|
}
|
||||||
|
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
|
||||||
|
{
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
assert(omp_in_parallel()==0);
|
assert(omp_in_parallel()==0);
|
||||||
#endif
|
#endif
|
||||||
@ -28,8 +46,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
|
|||||||
void * ret = NULL;
|
void * ret = NULL;
|
||||||
int v = -1;
|
int v = -1;
|
||||||
|
|
||||||
for(int e=0;e<Ncache;e++) {
|
for(int e=0;e<ncache;e++) {
|
||||||
if ( Entries[e].valid==0 ) {
|
if ( entries[e].valid==0 ) {
|
||||||
v=e;
|
v=e;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -37,40 +55,43 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
|
|||||||
|
|
||||||
if ( v==-1 ) {
|
if ( v==-1 ) {
|
||||||
v=victim;
|
v=victim;
|
||||||
victim = (victim+1)%Ncache;
|
victim = (victim+1)%ncache;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( Entries[v].valid ) {
|
if ( entries[v].valid ) {
|
||||||
ret = Entries[v].address;
|
ret = entries[v].address;
|
||||||
Entries[v].valid = 0;
|
entries[v].valid = 0;
|
||||||
Entries[v].address = NULL;
|
entries[v].address = NULL;
|
||||||
Entries[v].bytes = 0;
|
entries[v].bytes = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Entries[v].address=ptr;
|
entries[v].address=ptr;
|
||||||
Entries[v].bytes =bytes;
|
entries[v].bytes =bytes;
|
||||||
Entries[v].valid =1;
|
entries[v].valid =1;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *PointerCache::Lookup(size_t bytes) {
|
void *PointerCache::Lookup(size_t bytes)
|
||||||
|
{
|
||||||
if (bytes < SMALL_LIMIT ) return NULL;
|
if (bytes < GRID_ALLOC_SMALL_LIMIT )
|
||||||
|
return Lookup(bytes,EntriesSmall,NcacheSmall);
|
||||||
|
return Lookup(bytes,Entries,Ncache);
|
||||||
|
}
|
||||||
|
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
|
||||||
|
{
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
assert(omp_in_parallel()==0);
|
assert(omp_in_parallel()==0);
|
||||||
#endif
|
#endif
|
||||||
|
for(int e=0;e<ncache;e++){
|
||||||
for(int e=0;e<Ncache;e++){
|
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||||
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
|
entries[e].valid = 0;
|
||||||
Entries[e].valid = 0;
|
return entries[e].address;
|
||||||
return Entries[e].address;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||||
{
|
{
|
||||||
|
@ -42,21 +42,21 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define POINTER_CACHE
|
#define POINTER_CACHE
|
||||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
||||||
|
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
// Move control to configure.ac and Config.h?
|
// Move control to configure.ac and Config.h?
|
||||||
#ifdef POINTER_CACHE
|
|
||||||
class PointerCache {
|
class PointerCache {
|
||||||
private:
|
private:
|
||||||
/*Pinning pages is costly*/
|
/*Pinning pages is costly*/
|
||||||
/*Could maintain separate large and small allocation caches*/
|
/*Could maintain separate large and small allocation caches*/
|
||||||
#ifdef GRID_NVCC
|
/* Could make these configurable, perhaps up to a max size*/
|
||||||
static const int Ncache=128;
|
static const int NcacheSmallMax=128;
|
||||||
#else
|
static const int NcacheMax=16;
|
||||||
static const int Ncache=8;
|
static int NcacheSmall;
|
||||||
#endif
|
static int Ncache;
|
||||||
static int victim;
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
void *address;
|
void *address;
|
||||||
@ -64,15 +64,18 @@ private:
|
|||||||
int valid;
|
int valid;
|
||||||
} PointerCacheEntry;
|
} PointerCacheEntry;
|
||||||
|
|
||||||
static PointerCacheEntry Entries[Ncache];
|
static PointerCacheEntry Entries[NcacheMax];
|
||||||
|
static int Victim;
|
||||||
|
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
|
||||||
|
static int VictimSmall;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
static void Init(void);
|
||||||
static void *Insert(void *ptr,size_t bytes) ;
|
static void *Insert(void *ptr,size_t bytes) ;
|
||||||
|
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
|
||||||
static void *Lookup(size_t bytes) ;
|
static void *Lookup(size_t bytes) ;
|
||||||
|
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
std::string sizeString(size_t bytes);
|
std::string sizeString(size_t bytes);
|
||||||
|
|
||||||
|
@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
|
|||||||
if (heap_bytes >= heap_size) {
|
if (heap_bytes >= heap_size) {
|
||||||
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
||||||
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
||||||
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
|
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
|
||||||
|
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
|
||||||
|
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
|
||||||
assert(heap_bytes<heap_size);
|
assert(heap_bytes<heap_size);
|
||||||
}
|
}
|
||||||
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
|
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
|
||||||
|
@ -40,6 +40,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
@ -50,6 +51,7 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
auto lhs_v = lhs.View();
|
auto lhs_v = lhs.View();
|
||||||
auto ret_v = ret.View();
|
auto ret_v = ret.View();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
|
@ -110,15 +110,15 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
|
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
|
||||||
}
|
}
|
||||||
GridTime Elapsed(void) {
|
GridTime Elapsed(void) const {
|
||||||
assert(running == false);
|
assert(running == false);
|
||||||
return std::chrono::duration_cast<GridTime>( accumulator );
|
return std::chrono::duration_cast<GridTime>( accumulator );
|
||||||
}
|
}
|
||||||
uint64_t useconds(void){
|
uint64_t useconds(void) const {
|
||||||
assert(running == false);
|
assert(running == false);
|
||||||
return (uint64_t) accumulator.count();
|
return (uint64_t) accumulator.count();
|
||||||
}
|
}
|
||||||
bool isRunning(void){
|
bool isRunning(void) const {
|
||||||
return running;
|
return running;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -59,7 +59,7 @@ public:
|
|||||||
{
|
{
|
||||||
RealD eps = 1.0;
|
RealD eps = 1.0;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
|
// std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
|
||||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
||||||
assert(zdata->n==this->Ls);
|
assert(zdata->n==this->Ls);
|
||||||
|
|
||||||
|
@ -779,9 +779,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
assert(mu>=0);
|
assert(mu>=0);
|
||||||
assert(mu<Nd);
|
assert(mu<Nd);
|
||||||
|
|
||||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// SHAMIR CASE
|
// SHAMIR CASE
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
@ -829,6 +829,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// GENERAL CAYLEY CASE
|
// GENERAL CAYLEY CASE
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
|
@ -159,6 +159,7 @@ const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
|
|||||||
Complex(-1),
|
Complex(-1),
|
||||||
Complex(-1)};
|
Complex(-1)};
|
||||||
|
|
||||||
|
//This is the old version
|
||||||
template <class FImpl>
|
template <class FImpl>
|
||||||
template <class mobj, class robj>
|
template <class mobj, class robj>
|
||||||
void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
|
void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
|
||||||
@ -180,6 +181,10 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
|
|||||||
auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
|
auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
|
||||||
auto gD3 = GammaB_right * D3;
|
auto gD3 = GammaB_right * D3;
|
||||||
|
|
||||||
|
auto D2g = D2 * GammaB_left;
|
||||||
|
auto pD1g = pD1 * GammaB_left;
|
||||||
|
auto gD3g = gD3 * GammaB_left;
|
||||||
|
|
||||||
for (int ie_left=0; ie_left < 6 ; ie_left++){
|
for (int ie_left=0; ie_left < 6 ; ie_left++){
|
||||||
int a_left = epsilon[ie_left][0]; //a
|
int a_left = epsilon[ie_left][0]; //a
|
||||||
int b_left = epsilon[ie_left][1]; //b
|
int b_left = epsilon[ie_left][1]; //b
|
||||||
@ -188,58 +193,71 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
|
|||||||
int a_right = epsilon[ie_right][0]; //a'
|
int a_right = epsilon[ie_right][0]; //a'
|
||||||
int b_right = epsilon[ie_right][1]; //b'
|
int b_right = epsilon[ie_right][1]; //b'
|
||||||
int c_right = epsilon[ie_right][2]; //c'
|
int c_right = epsilon[ie_right][2]; //c'
|
||||||
|
Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
|
||||||
//This is the \delta_{456}^{123} part
|
//This is the \delta_{456}^{123} part
|
||||||
if (wick_contraction[0]){
|
if (wick_contraction[0]){
|
||||||
auto D2g = D2 * GammaB_left;
|
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
||||||
|
auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
|
||||||
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
||||||
for (int beta_left=0; beta_left<Ns; beta_left++){
|
for (int beta_left=0; beta_left<Ns; beta_left++){
|
||||||
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
|
||||||
result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
|
auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
|
||||||
|
result()()() += eepD1*D2g_ab*gD3_ab;
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
//This is the \delta_{456}^{231} part
|
//This is the \delta_{456}^{231} part
|
||||||
if (wick_contraction[1]){
|
if (wick_contraction[1]){
|
||||||
auto pD1g = pD1 * GammaB_left;
|
|
||||||
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
|
||||||
for (int beta_left=0; beta_left<Ns; beta_left++){
|
|
||||||
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
||||||
result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
|
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
||||||
|
auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
|
||||||
|
for (int beta_left=0; beta_left<Ns; beta_left++){
|
||||||
|
auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
|
||||||
|
auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
|
||||||
|
result()()() += eepD1g_gb*D2_ab*gD3_ag;
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
//This is the \delta_{456}^{312} part
|
//This is the \delta_{456}^{312} part
|
||||||
if (wick_contraction[2]){
|
if (wick_contraction[2]){
|
||||||
auto gD3g = gD3 * GammaB_left;
|
|
||||||
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
|
||||||
for (int beta_left=0; beta_left<Ns; beta_left++){
|
|
||||||
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
||||||
result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
|
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
||||||
|
auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
|
||||||
|
for (int beta_left=0; beta_left<Ns; beta_left++){
|
||||||
|
auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
|
||||||
|
auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
|
||||||
|
result()()() += eepD1_gb*D2_ag*gD3g_ab;
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
//This is the \delta_{456}^{132} part
|
//This is the \delta_{456}^{132} part
|
||||||
if (wick_contraction[3]){
|
if (wick_contraction[3]){
|
||||||
auto gD3g = gD3 * GammaB_left;
|
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
||||||
|
auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
|
||||||
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
||||||
for (int beta_left=0; beta_left<Ns; beta_left++){
|
for (int beta_left=0; beta_left<Ns; beta_left++){
|
||||||
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
|
||||||
result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
|
auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
|
||||||
|
result()()() -= eepD1*D2_ab*gD3g_ab;
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
//This is the \delta_{456}^{321} part
|
//This is the \delta_{456}^{321} part
|
||||||
if (wick_contraction[4]){
|
if (wick_contraction[4]){
|
||||||
auto D2g = D2 * GammaB_left;
|
|
||||||
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
|
||||||
for (int beta_left=0; beta_left<Ns; beta_left++){
|
|
||||||
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
||||||
result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
|
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
||||||
|
auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
|
||||||
|
for (int beta_left=0; beta_left<Ns; beta_left++){
|
||||||
|
auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
|
||||||
|
auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
|
||||||
|
result()()() -= eepD1_gb*D2g_ab*gD3_ag;
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
//This is the \delta_{456}^{213} part
|
//This is the \delta_{456}^{213} part
|
||||||
if (wick_contraction[5]){
|
if (wick_contraction[5]){
|
||||||
auto pD1g = pD1 * GammaB_left;
|
|
||||||
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
|
||||||
for (int beta_left=0; beta_left<Ns; beta_left++){
|
|
||||||
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
for (int gamma_left=0; gamma_left<Ns; gamma_left++){
|
||||||
result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
|
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
|
||||||
|
auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
|
||||||
|
for (int beta_left=0; beta_left<Ns; beta_left++){
|
||||||
|
auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
|
||||||
|
auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
|
||||||
|
result()()() -= eepD1g_gb*D2_ag*gD3_ab;
|
||||||
}}}
|
}}}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -259,6 +277,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
|
|||||||
const int parity,
|
const int parity,
|
||||||
ComplexField &baryon_corr)
|
ComplexField &baryon_corr)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
|
||||||
|
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
|
||||||
|
|
||||||
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
|
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
|
||||||
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
|
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
|
||||||
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
|
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
|
||||||
@ -305,6 +327,10 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
|
|||||||
const int parity,
|
const int parity,
|
||||||
robj &result)
|
robj &result)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
|
||||||
|
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
|
||||||
|
|
||||||
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
|
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
|
||||||
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
|
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
|
||||||
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
|
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
|
||||||
@ -318,7 +344,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
|
|||||||
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
|
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
|
||||||
|
|
||||||
result=Zero();
|
result=Zero();
|
||||||
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
|
baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
|
||||||
}
|
}
|
||||||
|
|
||||||
/***********************************************************************
|
/***********************************************************************
|
||||||
@ -558,6 +584,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
|
|||||||
const std::string op,
|
const std::string op,
|
||||||
SpinMatrixField &stn_corr)
|
SpinMatrixField &stn_corr)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
|
||||||
|
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
|
||||||
|
|
||||||
GridBase *grid = qs_ti.Grid();
|
GridBase *grid = qs_ti.Grid();
|
||||||
|
|
||||||
auto vcorr= stn_corr.View();
|
auto vcorr= stn_corr.View();
|
||||||
@ -595,6 +625,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
|
|||||||
const std::string op,
|
const std::string op,
|
||||||
SpinMatrixField &stn_corr)
|
SpinMatrixField &stn_corr)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
|
||||||
|
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
|
||||||
|
|
||||||
GridBase *grid = qs_ti.Grid();
|
GridBase *grid = qs_ti.Grid();
|
||||||
|
|
||||||
auto vcorr= stn_corr.View();
|
auto vcorr= stn_corr.View();
|
||||||
|
@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
GridGpuInit(); // Must come first to set device prior to MPI init
|
GridGpuInit(); // Must come first to set device prior to MPI init
|
||||||
|
|
||||||
|
PointerCache::Init();
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
|
||||||
int MB;
|
int MB;
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
|
||||||
|
@ -56,6 +56,7 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
|
|||||||
void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
|
void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
|
||||||
template<class VectorInt>
|
template<class VectorInt>
|
||||||
void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
|
void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
|
||||||
|
void GridCmdOptionInt(std::string &str,int & val);
|
||||||
|
|
||||||
|
|
||||||
void GridParseLayout(char **argv,int argc,
|
void GridParseLayout(char **argv,int argc,
|
||||||
|
@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> L_list;
|
std::vector<int> L_list;
|
||||||
std::vector<int> Ls_list;
|
std::vector<int> Ls_list;
|
||||||
std::vector<double> mflop_list;
|
std::vector<double> mflop_list;
|
||||||
@ -76,7 +75,6 @@ struct controls {
|
|||||||
int Opt;
|
int Opt;
|
||||||
int CommsOverlap;
|
int CommsOverlap;
|
||||||
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
|
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
|
||||||
// int HugePages;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class Benchmark {
|
class Benchmark {
|
||||||
@ -119,14 +117,15 @@ public:
|
|||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
comms_header();
|
comms_header();
|
||||||
|
|
||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
for(int lat=16;lat<=maxlat;lat+=8){
|
||||||
for(int Ls=8;Ls<=8;Ls*=2){
|
// for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
{ int Ls=12;
|
||||||
|
|
||||||
Coordinate latt_size ({lat*mpi_layout[0],
|
Coordinate latt_size ({lat*mpi_layout[0],
|
||||||
lat*mpi_layout[1],
|
lat*mpi_layout[1],
|
||||||
lat*mpi_layout[2],
|
lat*mpi_layout[2],
|
||||||
lat*mpi_layout[3]});
|
lat*mpi_layout[3]});
|
||||||
|
std::cout << GridLogMessage<< latt_size <<std::endl;
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
RealD Nrank = Grid._Nprocessors;
|
RealD Nrank = Grid._Nprocessors;
|
||||||
RealD Nnode = Grid.NodeCount();
|
RealD Nnode = Grid.NodeCount();
|
||||||
@ -184,9 +183,6 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
// for(int i=0;i<t_time.size();i++){
|
|
||||||
// std::cout << i<<" "<<t_time[i]<<std::endl;
|
|
||||||
// }
|
|
||||||
|
|
||||||
dbytes=dbytes*ppn;
|
dbytes=dbytes*ppn;
|
||||||
double xbytes = dbytes*0.5;
|
double xbytes = dbytes*0.5;
|
||||||
@ -200,8 +196,6 @@ public:
|
|||||||
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||||
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -227,14 +221,15 @@ public:
|
|||||||
uint64_t NN;
|
uint64_t NN;
|
||||||
|
|
||||||
|
|
||||||
uint64_t lmax=48;
|
uint64_t lmax=32;
|
||||||
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
|
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
|
||||||
|
|
||||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
for(int lat=8;lat<=lmax;lat+=4){
|
for(int lat=8;lat<=lmax;lat+=8){
|
||||||
|
|
||||||
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
// NP= Grid.RankCount();
|
// NP= Grid.RankCount();
|
||||||
@ -270,191 +265,8 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if 0
|
|
||||||
static double DWF5(int Ls,int L)
|
|
||||||
{
|
|
||||||
// RealD mass=0.1;
|
|
||||||
RealD M5 =1.8;
|
|
||||||
|
|
||||||
double mflops;
|
static double DWF(int Ls,int L)
|
||||||
double mflops_best = 0;
|
|
||||||
double mflops_worst= 0;
|
|
||||||
std::vector<double> mflops_all;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////
|
|
||||||
// Set/Get the layout & grid size
|
|
||||||
///////////////////////////////////////////////////////
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
|
||||||
Coordinate local({L,L,L,L});
|
|
||||||
|
|
||||||
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}),
|
|
||||||
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
uint64_t NP = TmpGrid->RankCount();
|
|
||||||
uint64_t NN = TmpGrid->NodeCount();
|
|
||||||
NN_global=NN;
|
|
||||||
uint64_t SHM=NP/NN;
|
|
||||||
|
|
||||||
Coordinate internal;
|
|
||||||
if ( SHM == 1 ) internal = Coordinate({1,1,1,1});
|
|
||||||
else if ( SHM == 2 ) internal = Coordinate({2,1,1,1});
|
|
||||||
else if ( SHM == 4 ) internal = Coordinate({2,2,1,1});
|
|
||||||
else if ( SHM == 8 ) internal = Coordinate({2,2,2,1});
|
|
||||||
else assert(0);
|
|
||||||
|
|
||||||
Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
|
|
||||||
Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
|
|
||||||
|
|
||||||
///////// Welcome message ////////////
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
///////// Lattice Init ////////////
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
|
||||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
///////// RNG Init ////////////
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
|
||||||
|
|
||||||
///////// Source preparation ////////////
|
|
||||||
LatticeFermion src (sFGrid);
|
|
||||||
LatticeFermion tmp (sFGrid);
|
|
||||||
std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
|
|
||||||
random(RNG5,src);
|
|
||||||
std::cout << GridLogMessage << "intialised random source" << std::endl;
|
|
||||||
|
|
||||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
|
||||||
src = src*N2;
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
|
||||||
|
|
||||||
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
|
|
||||||
LatticeFermion src_e (sFrbGrid);
|
|
||||||
LatticeFermion src_o (sFrbGrid);
|
|
||||||
LatticeFermion r_e (sFrbGrid);
|
|
||||||
LatticeFermion r_o (sFrbGrid);
|
|
||||||
LatticeFermion r_eo (sFGrid);
|
|
||||||
LatticeFermion err (sFGrid);
|
|
||||||
{
|
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,src);
|
|
||||||
pickCheckerboard(Odd,src_o,src);
|
|
||||||
|
|
||||||
#if defined(AVX512)
|
|
||||||
const int num_cases = 6;
|
|
||||||
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
|
|
||||||
#else
|
|
||||||
const int num_cases = 4;
|
|
||||||
std::string fmt("U/S ; U/O ; G/S ; G/O ");
|
|
||||||
#endif
|
|
||||||
controls Cases [] = {
|
|
||||||
#ifdef AVX512
|
|
||||||
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
#endif
|
|
||||||
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
|
||||||
};
|
|
||||||
|
|
||||||
for(int c=0;c<num_cases;c++) {
|
|
||||||
|
|
||||||
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
|
|
||||||
WilsonKernelsStatic::Opt = Cases[c].Opt;
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
|
||||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
|
||||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
int nwarm = 100;
|
|
||||||
uint64_t ncall = 1000;
|
|
||||||
|
|
||||||
double t0=usecond();
|
|
||||||
sFGrid->Barrier();
|
|
||||||
for(int i=0;i<nwarm;i++){
|
|
||||||
sDw.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
}
|
|
||||||
sFGrid->Barrier();
|
|
||||||
double t1=usecond();
|
|
||||||
|
|
||||||
sDw.ZeroCounters();
|
|
||||||
time_statistics timestat;
|
|
||||||
std::vector<double> t_time(ncall);
|
|
||||||
for(uint64_t i=0;i<ncall;i++){
|
|
||||||
t0=usecond();
|
|
||||||
sDw.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
t1=usecond();
|
|
||||||
t_time[i] = t1-t0;
|
|
||||||
}
|
|
||||||
sFGrid->Barrier();
|
|
||||||
|
|
||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
|
||||||
double flops=(1344.0*volume)/2;
|
|
||||||
double mf_hi, mf_lo, mf_err;
|
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
|
||||||
mf_hi = flops/timestat.min;
|
|
||||||
mf_lo = flops/timestat.max;
|
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
|
||||||
|
|
||||||
mflops = flops/timestat.mean;
|
|
||||||
mflops_all.push_back(mflops);
|
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
|
||||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
|
||||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
|
|
||||||
|
|
||||||
sDw.Report();
|
|
||||||
|
|
||||||
}
|
|
||||||
double robust = mflops_worst/mflops_best;;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
|
|
||||||
std::cout<<GridLogMessage <<fmt << std::endl;
|
|
||||||
std::cout<<GridLogMessage;
|
|
||||||
|
|
||||||
for(int i=0;i<mflops_all.size();i++){
|
|
||||||
std::cout<<mflops_all[i]/NN<<" ; " ;
|
|
||||||
}
|
|
||||||
std::cout<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
return mflops_best;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static double DWF(int Ls,int L, double & robust)
|
|
||||||
{
|
{
|
||||||
RealD mass=0.1;
|
RealD mass=0.1;
|
||||||
RealD M5 =1.8;
|
RealD M5 =1.8;
|
||||||
@ -471,37 +283,30 @@ public:
|
|||||||
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
||||||
Coordinate local({L,L,L,L});
|
Coordinate local({L,L,L,L});
|
||||||
|
|
||||||
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}),
|
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
|
||||||
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
uint64_t NP = TmpGrid->RankCount();
|
uint64_t NP = TmpGrid->RankCount();
|
||||||
uint64_t NN = TmpGrid->NodeCount();
|
uint64_t NN = TmpGrid->NodeCount();
|
||||||
NN_global=NN;
|
NN_global=NN;
|
||||||
uint64_t SHM=NP/NN;
|
uint64_t SHM=NP/NN;
|
||||||
|
|
||||||
Coordinate internal;
|
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
|
||||||
if ( SHM == 1 ) internal = Coordinate({1,1,1,1});
|
|
||||||
else if ( SHM == 2 ) internal = Coordinate({2,1,1,1});
|
|
||||||
else if ( SHM == 4 ) internal = Coordinate({2,2,1,1});
|
|
||||||
else if ( SHM == 8 ) internal = Coordinate({2,2,2,1});
|
|
||||||
else assert(0);
|
|
||||||
|
|
||||||
Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
|
|
||||||
Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
|
|
||||||
|
|
||||||
///////// Welcome message ////////////
|
///////// Welcome message ////////////
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
|
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
|
||||||
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
|
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
|
||||||
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
|
||||||
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
|
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
|
||||||
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
|
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
///////// Lattice Init ////////////
|
///////// Lattice Init ////////////
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
@ -514,74 +319,31 @@ public:
|
|||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||||
|
|
||||||
|
typedef DomainWallFermionF Action;
|
||||||
|
typedef typename Action::FermionField Fermion;
|
||||||
|
typedef LatticeGaugeFieldF Gauge;
|
||||||
|
|
||||||
///////// Source preparation ////////////
|
///////// Source preparation ////////////
|
||||||
LatticeFermion src (FGrid); random(RNG5,src);
|
Gauge Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
||||||
LatticeFermion ref (FGrid);
|
Fermion src (FGrid); random(RNG5,src);
|
||||||
LatticeFermion tmp (FGrid);
|
Fermion src_e (FrbGrid);
|
||||||
|
Fermion src_o (FrbGrid);
|
||||||
|
Fermion r_e (FrbGrid);
|
||||||
|
Fermion r_o (FrbGrid);
|
||||||
|
Fermion r_eo (FGrid);
|
||||||
|
Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
|
||||||
std::cout<<GridLogMessage << "Normalising src "<< N2 <<std::endl;
|
|
||||||
src = src*N2;
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
|
||||||
|
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
|
||||||
|
|
||||||
////////////////////////////////////
|
|
||||||
// Naive wilson implementation
|
|
||||||
////////////////////////////////////
|
|
||||||
{
|
|
||||||
LatticeGaugeField Umu5d(FGrid);
|
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
|
||||||
auto Umu_v = Umu.View();
|
|
||||||
auto Umu5d_v = Umu5d.View();
|
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ref = Zero();
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
|
||||||
}
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
|
|
||||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
|
||||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
|
||||||
|
|
||||||
tmp =adj(U[mu])*src;
|
|
||||||
tmp =Cshift(tmp,mu+1,-1);
|
|
||||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
|
||||||
}
|
|
||||||
ref = -0.5*ref;
|
|
||||||
}
|
|
||||||
|
|
||||||
LatticeFermion src_e (FrbGrid);
|
|
||||||
LatticeFermion src_o (FrbGrid);
|
|
||||||
LatticeFermion r_e (FrbGrid);
|
|
||||||
LatticeFermion r_o (FrbGrid);
|
|
||||||
LatticeFermion r_eo (FGrid);
|
|
||||||
LatticeFermion err (FGrid);
|
|
||||||
{
|
{
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,src);
|
pickCheckerboard(Even,src_e,src);
|
||||||
pickCheckerboard(Odd,src_o,src);
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
#if defined(AVX512)
|
|
||||||
const int num_cases = 6;
|
|
||||||
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
|
|
||||||
#else
|
|
||||||
const int num_cases = 4;
|
const int num_cases = 4;
|
||||||
std::string fmt("U/S ; U/O ; G/S ; G/O ");
|
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
|
||||||
#endif
|
|
||||||
controls Cases [] = {
|
controls Cases [] = {
|
||||||
#ifdef AVX512
|
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||||
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||||
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
#endif
|
|
||||||
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
|
||||||
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
||||||
};
|
};
|
||||||
@ -594,15 +356,12 @@ public:
|
|||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
|
||||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
int nwarm = 200;
|
int nwarm = 10;
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
for(int i=0;i<nwarm;i++){
|
for(int i=0;i<nwarm;i++){
|
||||||
@ -610,9 +369,7 @@ public:
|
|||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
|
uint64_t ncall = 50;
|
||||||
// if (ncall < 500) ncall = 500;
|
|
||||||
uint64_t ncall = 1000;
|
|
||||||
|
|
||||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
|
|
||||||
@ -649,24 +406,11 @@ public:
|
|||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
|
||||||
Dw.Report();
|
|
||||||
|
|
||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
Dw.DhopOE(src_e,r_o,DaggerNo);
|
|
||||||
setCheckerboard(r_eo,r_o);
|
|
||||||
setCheckerboard(r_eo,r_e);
|
|
||||||
err = r_eo-ref;
|
|
||||||
RealD absref = norm2(ref);
|
|
||||||
RealD abserr = norm2(err);
|
|
||||||
std::cout<<GridLogMessage << "norm diff "<< abserr << " / " << absref<<std::endl;
|
|
||||||
assert(abserr<1.0e-4);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
robust = mflops_worst/mflops_best;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
||||||
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
|
|
||||||
std::cout<<GridLogMessage <<fmt << std::endl;
|
std::cout<<GridLogMessage <<fmt << std::endl;
|
||||||
std::cout<<GridLogMessage ;
|
std::cout<<GridLogMessage ;
|
||||||
|
|
||||||
@ -680,8 +424,166 @@ public:
|
|||||||
return mflops_best;
|
return mflops_best;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static double Staggered(int L)
|
||||||
|
{
|
||||||
|
double mflops;
|
||||||
|
double mflops_best = 0;
|
||||||
|
double mflops_worst= 0;
|
||||||
|
std::vector<double> mflops_all;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
// Set/Get the layout & grid size
|
||||||
|
///////////////////////////////////////////////////////
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
|
||||||
|
Coordinate local({L,L,L,L});
|
||||||
|
|
||||||
|
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
|
||||||
|
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||||
|
GridDefaultMpi());
|
||||||
|
uint64_t NP = TmpGrid->RankCount();
|
||||||
|
uint64_t NN = TmpGrid->NodeCount();
|
||||||
|
NN_global=NN;
|
||||||
|
uint64_t SHM=NP/NN;
|
||||||
|
|
||||||
|
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
|
||||||
|
|
||||||
|
///////// Welcome message ////////////
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
///////// Lattice Init ////////////
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
|
||||||
|
|
||||||
|
///////// RNG Init ////////////
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
typedef ImprovedStaggeredFermionF Action;
|
||||||
|
typedef typename Action::FermionField Fermion;
|
||||||
|
typedef LatticeGaugeFieldF Gauge;
|
||||||
|
|
||||||
|
Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu);
|
||||||
|
|
||||||
|
typename Action::ImplParams params;
|
||||||
|
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
|
||||||
|
|
||||||
|
///////// Source preparation ////////////
|
||||||
|
Fermion src (FGrid); random(RNG4,src);
|
||||||
|
Fermion src_e (FrbGrid);
|
||||||
|
Fermion src_o (FrbGrid);
|
||||||
|
Fermion r_e (FrbGrid);
|
||||||
|
Fermion r_o (FrbGrid);
|
||||||
|
Fermion r_eo (FGrid);
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
const int num_cases = 4;
|
||||||
|
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
|
||||||
|
|
||||||
|
controls Cases [] = {
|
||||||
|
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||||
|
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||||
|
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
|
||||||
|
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
for(int c=0;c<num_cases;c++) {
|
||||||
|
|
||||||
|
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
|
||||||
|
StaggeredKernelsStatic::Opt = Cases[c].Opt;
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int nwarm = 10;
|
||||||
|
double t0=usecond();
|
||||||
|
FGrid->Barrier();
|
||||||
|
for(int i=0;i<nwarm;i++){
|
||||||
|
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
double t1=usecond();
|
||||||
|
uint64_t ncall = 500;
|
||||||
|
|
||||||
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
|
|
||||||
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
Ds.ZeroCounters();
|
||||||
|
|
||||||
|
time_statistics timestat;
|
||||||
|
std::vector<double> t_time(ncall);
|
||||||
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
|
t0=usecond();
|
||||||
|
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
t1=usecond();
|
||||||
|
t_time[i] = t1-t0;
|
||||||
|
}
|
||||||
|
FGrid->Barrier();
|
||||||
|
|
||||||
|
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1146.0*volume)/2;
|
||||||
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
|
timestat.statistics(t_time);
|
||||||
|
mf_hi = flops/timestat.min;
|
||||||
|
mf_lo = flops/timestat.max;
|
||||||
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
|
mflops = flops/timestat.mean;
|
||||||
|
mflops_all.push_back(mflops);
|
||||||
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
||||||
|
std::cout<<GridLogMessage <<fmt << std::endl;
|
||||||
|
std::cout<<GridLogMessage ;
|
||||||
|
|
||||||
|
for(int i=0;i<mflops_all.size();i++){
|
||||||
|
std::cout<<mflops_all[i]/NN<<" ; " ;
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
return mflops_best;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
@ -696,62 +598,50 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
int do_memory=1;
|
int do_memory=1;
|
||||||
int do_comms =1;
|
int do_comms =1;
|
||||||
int do_su3 =0;
|
|
||||||
int do_wilson=1;
|
|
||||||
int do_dwf =1;
|
|
||||||
|
|
||||||
if ( do_su3 ) {
|
|
||||||
// empty for now
|
|
||||||
}
|
|
||||||
#if 1
|
|
||||||
int sel=2;
|
int sel=2;
|
||||||
Coordinate L_list({8,12,16,24});
|
std::vector<int> L_list({16,24,32});
|
||||||
#else
|
|
||||||
int sel=1;
|
|
||||||
Coordinate L_list({8,12});
|
|
||||||
#endif
|
|
||||||
int selm1=sel-1;
|
int selm1=sel-1;
|
||||||
std::vector<double> robust_list;
|
|
||||||
|
|
||||||
std::vector<double> wilson;
|
std::vector<double> wilson;
|
||||||
std::vector<double> dwf4;
|
std::vector<double> dwf4;
|
||||||
std::vector<double> dwf5;
|
std::vector<double> staggered;
|
||||||
|
|
||||||
if ( do_wilson ) {
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
|
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
for(int l=0;l<L_list.size();l++){
|
for(int l=0;l<L_list.size();l++){
|
||||||
double robust;
|
wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
|
||||||
wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ls=16;
|
Ls=12;
|
||||||
if ( do_dwf ) {
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
|
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
for(int l=0;l<L_list.size();l++){
|
for(int l=0;l<L_list.size();l++){
|
||||||
double robust;
|
double result = Benchmark::DWF(Ls,L_list[l]) ;
|
||||||
double result = Benchmark::DWF(Ls,L_list[l],robust) ;
|
|
||||||
dwf4.push_back(result);
|
dwf4.push_back(result);
|
||||||
robust_list.push_back(robust);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( do_dwf ) {
|
/*
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
for(int l=0;l<L_list.size();l++){
|
||||||
|
double result = Benchmark::Staggered(L_list[l]) ;
|
||||||
|
staggered.push_back(result);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
|
std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
|
||||||
for(int l=0;l<L_list.size();l++){
|
for(int l=0;l<L_list.size();l++){
|
||||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
|
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
|
||||||
}
|
}
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
}
|
|
||||||
|
|
||||||
int NN=NN_global;
|
int NN=NN_global;
|
||||||
if ( do_memory ) {
|
if ( do_memory ) {
|
||||||
@ -768,7 +658,6 @@ int main (int argc, char ** argv)
|
|||||||
Benchmark::Comms();
|
Benchmark::Comms();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( do_dwf ) {
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
@ -782,10 +671,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
|
||||||
std::cout<<std::setprecision(3);
|
std::cout<<std::setprecision(3);
|
||||||
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
176
benchmarks/Benchmark_schur.cc
Normal file
176
benchmarks/Benchmark_schur.cc
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./benchmarks/Benchmark_schur.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
|
||||||
|
// Gamma-matrix identifiers listed in the order X, Y, Z, T.
Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
                          Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT };
|
||||||
|
|
||||||
|
// Forward declaration of the per-volume benchmark routine, which is defined after main().
// L holds the four lattice extents and Ls is the extent of the fifth dimension.
void benchDw(std::vector<int> & L, int Ls);
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
|
||||||
|
const int Ls=12;
|
||||||
|
std::vector< std::vector<int> > latts;
|
||||||
|
#if 1
|
||||||
|
latts.push_back(std::vector<int> ({24,24,24,24}) );
|
||||||
|
latts.push_back(std::vector<int> ({48,24,24,24}) );
|
||||||
|
latts.push_back(std::vector<int> ({96,24,24,24}) );
|
||||||
|
latts.push_back(std::vector<int> ({96,48,24,24}) );
|
||||||
|
// latts.push_back(std::vector<int> ({96,48,48,24}) );
|
||||||
|
// latts.push_back(std::vector<int> ({96,48,48,48}) );
|
||||||
|
#else
|
||||||
|
// latts.push_back(std::vector<int> ({96,48,48,48}) );
|
||||||
|
latts.push_back(std::vector<int> ({96,96,96,192}) );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
|
||||||
|
for (int l=0;l<latts.size();l++){
|
||||||
|
std::vector<int> latt4 = latts[l];
|
||||||
|
std::cout << GridLogMessage <<"\t";
|
||||||
|
for(int d=0;d<Nd;d++){
|
||||||
|
std::cout<<latt4[d]<<"x";
|
||||||
|
}
|
||||||
|
std::cout <<Ls<<"\t" ;
|
||||||
|
benchDw (latt4,Ls);
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void benchDw(std::vector<int> & latt4, int Ls)
|
||||||
|
{
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// for Nc=3
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Dw : Ls*24*(7+48)= Ls*1320
|
||||||
|
//
|
||||||
|
// M5D: Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
|
||||||
|
// Meo: Ls*24*(7+48) + Ls*72 = Ls*1392
|
||||||
|
//
|
||||||
|
// Mee: 3*Ns*2*Nc*Ls // Chroma 6*N5*Nc*Ns
|
||||||
|
//
|
||||||
|
// LeemInv : 2*2*Nc*madd*Ls
|
||||||
|
// LeeInv : 2*2*Nc*madd*Ls
|
||||||
|
// DeeInv : 4*2*Nc*mul *Ls
|
||||||
|
// UeeInv : 2*2*Nc*madd*Ls
|
||||||
|
// UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
|
||||||
|
// QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
|
||||||
|
// Mpc => 1452*cbvol*2*Ls flops //
|
||||||
|
// => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
// long unsigned int single_site_flops = 8*Nc*(7+16*Nc)*Ls;
|
||||||
|
long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;
|
||||||
|
long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls;
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
|
|
||||||
|
ColourMatrixF cm = ComplexF(1.0,0.0);
|
||||||
|
|
||||||
|
int ncall=300;
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
|
||||||
|
LatticeGaugeFieldF Umu(UGrid); Umu=Zero();
|
||||||
|
MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);
|
||||||
|
|
||||||
|
LatticeFermionF src_o (FrbGrid); src_o=1.0;
|
||||||
|
LatticeFermionF r_o (FrbGrid); r_o=Zero();
|
||||||
|
|
||||||
|
int order =151;
|
||||||
|
SchurDiagOneOperator<MobiusFermionF,LatticeFermionF> Mpc(Dw);
|
||||||
|
Chebyshev<LatticeFermionF> Cheby(0.0,60.0,order);
|
||||||
|
|
||||||
|
{
|
||||||
|
Mpc.Mpc(src_o,r_o);
|
||||||
|
Mpc.Mpc(src_o,r_o);
|
||||||
|
Mpc.Mpc(src_o,r_o);
|
||||||
|
|
||||||
|
double t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Mpc.Mpc(src_o,r_o);
|
||||||
|
}
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo so CB cancels.
|
||||||
|
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0);
|
||||||
|
flops=(single_site_quda_flops*volume*ncall);
|
||||||
|
std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";
|
||||||
|
|
||||||
|
// Cheby uses MpcDagMpc so 2x flops
|
||||||
|
for(int i=0;i<1;i++){
|
||||||
|
Cheby(Mpc,src_o,r_o);
|
||||||
|
t0=usecond();
|
||||||
|
Cheby(Mpc,src_o,r_o);
|
||||||
|
t1=usecond();
|
||||||
|
flops=(single_site_mpc_flops*volume*2*order);
|
||||||
|
std::cout <<"\t"<<flops/(t1-t0);
|
||||||
|
flops=(single_site_quda_flops*volume*2*order);
|
||||||
|
std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
|
||||||
|
std::cout <<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Dw.Report();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -88,25 +88,6 @@ int main (int argc, char ** argv)
|
|||||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||||
}
|
}
|
||||||
ref = Zero();
|
ref = Zero();
|
||||||
/*
|
|
||||||
{ // Naive wilson implementation
|
|
||||||
ref = Zero();
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
|
|
||||||
tmp = U[mu]*Cshift(src,mu,1);
|
|
||||||
for(int i=0;i<ref._odata.size();i++){
|
|
||||||
ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp =adj(U[mu])*src;
|
|
||||||
tmp =Cshift(tmp,mu,-1);
|
|
||||||
for(int i=0;i<ref._odata.size();i++){
|
|
||||||
ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ref = -0.5*ref;
|
|
||||||
*/
|
|
||||||
|
|
||||||
RealD mass=0.1;
|
RealD mass=0.1;
|
||||||
RealD c1=9.0/8.0;
|
RealD c1=9.0/8.0;
|
||||||
|
14
configure.ac
14
configure.ac
@ -274,12 +274,20 @@ case ${ac_gen_scalar} in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
##################### Compiler dependent choices
|
##################### Compiler dependent choices
|
||||||
case ${CXX} in
|
|
||||||
|
#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison
|
||||||
|
CXXBASE=${CXX}
|
||||||
|
CXXTEST=${CXX}
|
||||||
|
if echo "${CXX}" | grep -q "nvcc"; then
|
||||||
|
CXXTEST="nvcc"
|
||||||
|
fi
|
||||||
|
|
||||||
|
case ${CXXTEST} in
|
||||||
nvcc)
|
nvcc)
|
||||||
# CXX="nvcc -keep -v -x cu "
|
# CXX="nvcc -keep -v -x cu "
|
||||||
# CXXLD="nvcc -v -link"
|
# CXXLD="nvcc -v -link"
|
||||||
CXX="nvcc -x cu "
|
CXX="${CXXBASE} -x cu "
|
||||||
CXXLD="nvcc -link"
|
CXXLD="${CXXBASE} -link"
|
||||||
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
|
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
|
||||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
||||||
if test $ac_openmp = yes; then
|
if test $ac_openmp = yes; then
|
||||||
|
Loading…
Reference in New Issue
Block a user