Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-09 23:45:36 +00:00

Commit e2fc3a0f04

.gitignore (vendored)
@@ -88,6 +88,7 @@ Thumbs.db
 # build directory #
 ###################
 build*/*
+Documentation/_build

 # IDE related files #
 #####################
@@ -358,7 +358,7 @@ public:
 autoView( in_v , in, AcceleratorRead);
 autoView( out_v , out, AcceleratorWrite);
 autoView( Stencil_v , Stencil, AcceleratorRead);
-auto& geom_v = geom;
+int npoint = geom.npoint;
 typedef LatticeView<Cobj> Aview;

 Vector<Aview> AcceleratorViewContainer;
@@ -380,7 +380,7 @@ public:
 int ptype;
 StencilEntry *SE;

-for(int point=0;point<geom_v.npoint;point++){
+for(int point=0;point<npoint;point++){

 SE=Stencil_v.GetEntry(ptype,point,ss);

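The swap of `auto& geom_v = geom` for a plain `int npoint = geom.npoint` is likely more than tidying: these loop bodies run inside device lambdas, and capturing a reference to a host-side object leaves the kernel chasing a host pointer, while an `int` is copied by value into the kernel. A minimal sketch of the pattern; `run_on_device` is a hypothetical stand-in for Grid's `accelerator_for`:

```cpp
#include <cstdio>

struct Geometry { int npoint = 9; };

template <typename F> void run_on_device(int n, F f) {
  for (int ss = 0; ss < n; ss++) f(ss);   // placeholder for a real device launch
}

void halo_sum(const Geometry& geom) {
  // Pre-change shape: auto& geom_v = geom;  -- a reference capture would make
  // the device lambda dereference host memory.
  int npoint = geom.npoint;                // plain value, safely copied into the lambda
  run_on_device(4, [=](int ss) {
    for (int point = 0; point < npoint; point++) {
      std::printf("site %d point %d\n", ss, point);
    }
  });
}
```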
@@ -424,7 +424,7 @@ public:
 autoView( in_v , in, AcceleratorRead);
 autoView( out_v , out, AcceleratorWrite);
 autoView( Stencil_v , Stencil, AcceleratorRead);
-auto& geom_v = geom;
+int npoint = geom.npoint;
 typedef LatticeView<Cobj> Aview;

 Vector<Aview> AcceleratorViewContainer;
@@ -454,7 +454,7 @@ public:
 int ptype;
 StencilEntry *SE;

-for(int p=0;p<geom_v.npoint;p++){
+for(int p=0;p<npoint;p++){
 int point = points_p[p];

 SE=Stencil_v.GetEntry(ptype,point,ss);
@@ -52,6 +52,7 @@ public:
 virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
 virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
 virtual void HermOp(const Field &in, Field &out)=0;
+virtual ~LinearOperatorBase(){};
 };

@@ -507,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
 virtual void MpcDag (const Field &in, Field &out){
 Mpc(in,out);
 }
-virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+virtual void MpcDagMpc(const Field &in, Field &out) {
 assert(0);// Never need with staggered
 }
 };
@@ -530,6 +531,16 @@ public:
 template<class Field> class LinearFunction {
 public:
 virtual void operator() (const Field &in, Field &out) = 0;
+
+virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
+{
+assert(in.size() == out.size());
+
+for (unsigned int i = 0; i < in.size(); ++i)
+{
+(*this)(in[i], out[i]);
+}
+}
 };

 template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
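The batched `operator()` added to `LinearFunction` gives every functor a default multi-right-hand-side entry point that simply loops over the scalar overload; derived classes keep both overloads visible with a `using` declaration. A self-contained sketch of how the pieces fit together (`Doubler` is a made-up functor, not a Grid class):

```cpp
#include <cassert>
#include <vector>

template<class Field> struct LinearFunction {
  virtual void operator()(const Field& in, Field& out) = 0;
  // Batched default: apply the scalar overload element by element.
  virtual void operator()(const std::vector<Field>& in, std::vector<Field>& out) {
    assert(in.size() == out.size());
    for (unsigned int i = 0; i < in.size(); ++i) (*this)(in[i], out[i]);
  }
  virtual ~LinearFunction() = default;
};

struct Doubler : LinearFunction<double> {
  using LinearFunction<double>::operator();  // keep the batched overload visible
  void operator()(const double& in, double& out) override { out = 2.0 * in; }
};

int main() {
  std::vector<double> in{1.0, 2.0}, out(2);
  Doubler d;
  d(in, out);  // resolves to the inherited batched overload
  return 0;
}
```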
@@ -575,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
 template<typename Field>
 class PlainHermOp : public LinearFunction<Field> {
 public:
+using LinearFunction<Field>::operator();
 LinearOperatorBase<Field> &_Linop;

 PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@@ -588,6 +600,7 @@ public:
 template<typename Field>
 class FunctionHermOp : public LinearFunction<Field> {
 public:
+using LinearFunction<Field>::operator();
 OperatorFunction<Field> & _poly;
 LinearOperatorBase<Field> &_Linop;

@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class Field> class Preconditioner : public LinearFunction<Field> {
+template<class Field> using Preconditioner = LinearFunction<Field> ;

+/*
+template<class Field> class Preconditioner : public LinearFunction<Field> {
+using LinearFunction<Field>::operator();
+virtual void operator()(const Field &src, Field & psi)=0;
+};
+*/

 template<class Field> class TrivialPrecon : public Preconditioner<Field> {
 public:
-void operator()(const Field &src, Field & psi){
+using Preconditioner<Field>::operator();
+virtual void operator()(const Field &src, Field & psi){
 psi = src;
 }
 TrivialPrecon(void){};
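The `using LinearFunction<Field>::operator();` lines threaded through the rest of this commit all guard against the same C++ name-hiding rule: overriding one `operator()` overload in a derived class hides every base-class overload, including the new vector version. A minimal demonstration, outside Grid, of the failure and the fix:

```cpp
#include <vector>

struct Base {
  virtual void f(int) {}
  virtual void f(const std::vector<int>&) {}
  virtual ~Base() = default;
};

struct DerivedHides : Base {
  void f(int) override {}   // hides BOTH Base::f overloads
};

struct DerivedOk : Base {
  using Base::f;            // re-expose the hidden vector overload
  void f(int) override {}
};

int main() {
  std::vector<int> v;
  DerivedOk ok;   ok.f(v);  // fine: inherited overload found
  DerivedHides bad;
  // bad.f(v);              // error: no matching f(const std::vector<int>&)
  bad.Base::f(v);           // only explicit qualification works here
  return 0;
}
```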
@@ -48,6 +48,7 @@ public:
 virtual void Mdiag (const Field &in, Field &out)=0;
 virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
 virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
+virtual ~SparseMatrixBase() {};
 };

 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
 virtual void MeooeDag (const Field &in, Field &out)=0;
 virtual void MooeeDag (const Field &in, Field &out)=0;
 virtual void MooeeInvDag (const Field &in, Field &out)=0;
-
+virtual ~CheckerBoardedSparseMatrixBase() {};
 };

 NAMESPACE_END(Grid);
@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
 template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
 {
 public:
+using LinearFunction<FieldD>::operator();
 RealD Tolerance;
 RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 Integer MaxInnerIterations;
@@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
 typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
 public:
+using LinearFunction<FieldD>::operator();
 RealD Tolerance;
 RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
 Integer MaxInnerIterations;
@@ -33,16 +33,19 @@ namespace Grid {
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
+using LinearFunction<Field>::operator();
 virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
 };
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
+using LinearFunction<Field>::operator();
 virtual void operator()(const Field &src, Field &guess) { };
 };
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
+using LinearFunction<Field>::operator();
 virtual void operator()(const Field &src, Field &guess) { guess = src; };
 };

@@ -54,15 +57,24 @@ class DeflatedGuesser: public LinearFunction<Field> {
 private:
 const std::vector<Field> &evec;
 const std::vector<RealD> &eval;
+const unsigned int N;

 public:
+using LinearFunction<Field>::operator();

-DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
+DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
+: DeflatedGuesser(_evec, _eval, _evec.size())
+{}
+
+DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
+: evec(_evec), eval(_eval), N(_N)
+{
+assert(evec.size()==eval.size());
+assert(N <= evec.size());
+}

 virtual void operator()(const Field &src,Field &guess) {
 guess = Zero();
-assert(evec.size()==eval.size());
-auto N = evec.size();
 for (int i=0;i<N;i++) {
 const Field& tmp = evec[i];
 axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
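For reference, the accumulation loop in `DeflatedGuesser::operator()` realizes the usual deflated starting guess: the source projected onto the stored eigenpairs of the (Hermitian) operator, each component rescaled by its eigenvalue:

```latex
x_0 \;=\; \sum_{i=0}^{N-1} \frac{\langle v_i,\,\mathrm{src}\rangle}{\lambda_i}\; v_i ,
\qquad A\,v_i = \lambda_i v_i .
```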
@@ -79,6 +91,7 @@ private:
 const std::vector<RealD> &eval_coarse;
 public:

+using LinearFunction<FineField>::operator();
 LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
 const std::vector<CoarseField> &_evec_coarse,
 const std::vector<RealD> &_eval_coarse)
@@ -67,6 +67,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
 typedef iVector<CComplex,nbasis > CoarseSiteVector;
 typedef Lattice<CoarseSiteVector> CoarseField;
 typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -97,6 +98,7 @@ public:
 template<class Fobj,class CComplex,int nbasis>
 class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
 public:
+using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
 typedef iVector<CComplex,nbasis > CoarseSiteVector;
 typedef Lattice<CoarseSiteVector> CoarseField;
 typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
 public:
-
+using LinearFunction<Field>::operator();
 RealD Tolerance;
 Integer MaxIterations;
 int verbose;
@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
 template<class Field>
 class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
 public:
-
+using LinearFunction<Field>::operator();
 RealD Tolerance;
 Integer MaxIterations;
 int verbose;
@@ -119,7 +119,8 @@ public:
 RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

 RealD cp;
-ComplexD a, b, zAz;
+ComplexD a, b;
+// ComplexD zAz;
 RealD zAAz;
 ComplexD rq;

@@ -146,7 +147,7 @@ public:
 //////////////////////////////////
 MatTimer.Start();
 Linop.Op(psi,Az);
-zAz = innerProduct(Az,psi);
+// zAz = innerProduct(Az,psi);
 zAAz= norm2(Az);
 MatTimer.Stop();

@@ -170,7 +171,7 @@ public:

 LinalgTimer.Start();

-zAz = innerProduct(Az,psi);
+// zAz = innerProduct(Az,psi);
 zAAz= norm2(Az);

 //p[0],q[0],qq[0]
@@ -212,7 +213,7 @@ public:
 MatTimer.Start();
 Linop.Op(z,Az);
 MatTimer.Stop();
-zAz = innerProduct(Az,psi);
+// zAz = innerProduct(Az,psi);
 zAAz= norm2(Az);

 LinalgTimer.Start();
@@ -185,16 +185,19 @@ namespace Grid {
 ////////////////////////////////////////////////
 if ( subGuess ) guess_save.resize(nblock,grid);

-for(int b=0;b<nblock;b++){
-if(useSolnAsInitGuess) {
+if(useSolnAsInitGuess) {
+for(int b=0;b<nblock;b++){
 pickCheckerboard(Odd, sol_o[b], out[b]);
+}
 } else {
-guess(src_o[b],sol_o[b]);
+guess(src_o, sol_o);
 }

-if ( subGuess ) {
-guess_save[b] = sol_o[b];
-}
-}
+if ( subGuess ) {
+for(int b=0;b<nblock;b++){
+guess_save[b] = sol_o[b];
+}
+}
 //////////////////////////////////////////////////////////////
 // Call the block solver
@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
 #define AccSmall (3)
 #define Shared (4)
 #define SharedSmall (5)
+#undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
 void MemoryManager::PrintBytes(void)
 {
-std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
-std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
-std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
+std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+std::cout << " MemoryManager : PrintBytes "<<std::endl;
+std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
+std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
+std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
+std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
+uint64_t cacheBytes;
+cacheBytes = CacheBytes[Cpu];
+std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
+cacheBytes = CacheBytes[Acc];
+std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
+cacheBytes = CacheBytes[Shared];
+std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
+
+#ifdef GRID_CUDA
+cuda_mem();
+#endif

 }

 //////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
+uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
 //////////////////////////////////////////////////////////////////////
 void *MemoryManager::AcceleratorAllocate(size_t bytes)
 {
-total_device+=bytes;
 void *ptr = (void *) Lookup(bytes,Acc);
 if ( ptr == (void *) NULL ) {
 ptr = (void *) acceleratorAllocDevice(bytes);
+total_device+=bytes;
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"AcceleratorAllocate "<<std::endl;
+PrintBytes();
+#endif
 return ptr;
 }
 void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
 {
-total_device-=bytes;
 void *__freeme = Insert(ptr,bytes,Acc);
 if ( __freeme ) {
 acceleratorFreeDevice(__freeme);
+total_device-=bytes;
 // PrintBytes();
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"AcceleratorFree "<<std::endl;
+PrintBytes();
+#endif
 }
 void *MemoryManager::SharedAllocate(size_t bytes)
 {
-total_shared+=bytes;
 void *ptr = (void *) Lookup(bytes,Shared);
 if ( ptr == (void *) NULL ) {
 ptr = (void *) acceleratorAllocShared(bytes);
+total_shared+=bytes;
+// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
+// PrintBytes();
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"SharedAllocate "<<std::endl;
+PrintBytes();
+#endif
 return ptr;
 }
 void MemoryManager::SharedFree (void *ptr,size_t bytes)
 {
-total_shared-=bytes;
 void *__freeme = Insert(ptr,bytes,Shared);
 if ( __freeme ) {
 acceleratorFreeShared(__freeme);
+total_shared-=bytes;
+// PrintBytes();
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"SharedFree "<<std::endl;
+PrintBytes();
+#endif
 }
 #ifdef GRID_UVM
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
-total_host+=bytes;
 void *ptr = (void *) Lookup(bytes,Cpu);
 if ( ptr == (void *) NULL ) {
 ptr = (void *) acceleratorAllocShared(bytes);
+total_host+=bytes;
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"CpuAllocate "<<std::endl;
+PrintBytes();
+#endif
 return ptr;
 }
 void MemoryManager::CpuFree (void *_ptr,size_t bytes)
 {
-total_host-=bytes;
 NotifyDeletion(_ptr);
 void *__freeme = Insert(_ptr,bytes,Cpu);
 if ( __freeme ) {
 acceleratorFreeShared(__freeme);
+total_host-=bytes;
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"CpuFree "<<std::endl;
+PrintBytes();
+#endif
 }
 #else
 void *MemoryManager::CpuAllocate(size_t bytes)
 {
-total_host+=bytes;
 void *ptr = (void *) Lookup(bytes,Cpu);
 if ( ptr == (void *) NULL ) {
 ptr = (void *) acceleratorAllocCpu(bytes);
+total_host+=bytes;
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"CpuAllocate "<<std::endl;
+PrintBytes();
+#endif
 return ptr;
 }
 void MemoryManager::CpuFree (void *_ptr,size_t bytes)
 {
-total_host-=bytes;
 NotifyDeletion(_ptr);
 void *__freeme = Insert(_ptr,bytes,Cpu);
 if ( __freeme ) {
 acceleratorFreeCpu(__freeme);
+total_host-=bytes;
 }
+#ifdef GRID_MM_VERBOSE
+std::cout <<"CpuFree "<<std::endl;
+PrintBytes();
+#endif
 }
 #endif

@@ -115,7 +159,6 @@ void MemoryManager::Init(void)

 char * str;
 int Nc;
-int NcS;

 str= getenv("GRID_ALLOC_NCACHE_LARGE");
 if ( str ) {
@@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
 bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 int cache = type + small;
-return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
+return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
 return ptr;
 #endif
 }

-void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
+void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
 assert(ncache>0);
 #ifdef GRID_OMP
@@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries

 if ( entries[v].valid ) {
 ret = entries[v].address;
+cacheBytes -= entries[v].bytes;
 entries[v].valid = 0;
 entries[v].address = NULL;
 entries[v].bytes = 0;
@@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 entries[v].address=ptr;
 entries[v].bytes =bytes;
 entries[v].valid =1;
+cacheBytes += bytes;

 return ret;
 }
@@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 #ifdef ALLOCATION_CACHE
 bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
 int cache = type+small;
-return Lookup(bytes,Entries[cache],Ncache[cache]);
+return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
 return NULL;
 #endif
 }

-void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
+void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
 assert(ncache>0);
 #ifdef GRID_OMP
@@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
 for(int e=0;e<ncache;e++){
 if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
 entries[e].valid = 0;
+cacheBytes -= entries[e].bytes;
 return entries[e].address;
 }
 }
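The `cacheBytes` plumbing keeps a running total of bytes parked in each free pool, updated at the three places an entry changes hands: insertion, eviction of a victim, and a lookup hit. A minimal stand-alone model of that invariant (illustrative only, not Grid's actual types; the round-robin victim choice is an assumption of this sketch):

```cpp
#include <cstdint>
#include <cstddef>

struct Entry { void* address = nullptr; size_t bytes = 0; bool valid = false; };

// Invariant: cacheBytes always equals the sum of bytes over valid entries.
void* insert(void* ptr, size_t bytes, Entry* e, int n, int& victim, uint64_t& cacheBytes) {
  int v = victim; victim = (victim + 1) % n;   // pick and advance a victim slot
  void* evicted = nullptr;
  if (e[v].valid) {                            // evict: caller frees the old block
    evicted = e[v].address;
    cacheBytes -= e[v].bytes;
  }
  e[v] = Entry{ptr, bytes, true};
  cacheBytes += bytes;
  return evicted;
}

void* lookup(size_t bytes, Entry* e, int n, uint64_t& cacheBytes) {
  for (int i = 0; i < n; i++)
    if (e[i].valid && e[i].bytes == bytes) {   // exact-size match only
      e[i].valid = false;
      cacheBytes -= e[i].bytes;
      return e[i].address;
    }
  return nullptr;                              // miss: caller allocates fresh
}
```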
@@ -82,14 +82,15 @@ private:
 static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 static int Victim[NallocType];
 static int Ncache[NallocType];
+static uint64_t CacheBytes[NallocType];

 /////////////////////////////////////////////////
 // Free pool
 /////////////////////////////////////////////////
 static void *Insert(void *ptr,size_t bytes,int type) ;
 static void *Lookup(size_t bytes,int type) ;
-static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
-static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
+static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
+static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;

 static void PrintBytes(void);
 public:
@@ -169,6 +170,7 @@ private:

 public:
 static void Print(void);
+static void PrintState( void* CpuPtr);
 static int isOpen (void* CpuPtr);
 static void ViewClose(void* CpuPtr,ViewMode mode);
 static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
@@ -3,7 +3,7 @@

 #warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
-//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
+//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
 #define dprintf(...)

@@ -429,6 +429,7 @@ void MemoryManager::NotifyDeletion(void *_ptr)
 }
 void MemoryManager::Print(void)
 {
+PrintBytes();
 std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
 std::cout << GridLogDebug << "Memory Manager " << std::endl;
 std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@@ -473,6 +474,32 @@ int MemoryManager::isOpen (void* _CpuPtr)
 }
 }

+void MemoryManager::PrintState(void* _CpuPtr)
+{
+uint64_t CpuPtr = (uint64_t)_CpuPtr;
+
+if ( EntryPresent(CpuPtr) ){
+auto AccCacheIterator = EntryLookup(CpuPtr);
+auto & AccCache = AccCacheIterator->second;
+std::string str;
+if ( AccCache.state==Empty ) str = std::string("Empty");
+if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
+if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
+if ( AccCache.state==Consistent)str = std::string("Consistent");
+if ( AccCache.state==EvictNext) str = std::string("EvictNext");
+
+std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
+<< "\t" << AccCache.cpuLock
+<< "\t" << AccCache.accLock
+<< "\t" << AccCache.LRU_valid<<std::endl;
+
+} else {
+std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
+}
+}

 NAMESPACE_END(Grid);

 #endif
@@ -16,6 +16,10 @@ uint64_t MemoryManager::DeviceToHostXfer;
 void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int MemoryManager::isOpen (void* CpuPtr) { return 0;}
+void MemoryManager::PrintState(void* CpuPtr)
+{
+std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
+};
 void MemoryManager::Print(void){};
 void MemoryManager::NotifyDeletion(void *ptr){};

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+bool Stencil_force_mpi = true;
+
 ///////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
@@ -35,11 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-#ifdef GRID_MPI3_SHM_NVLINK
-const bool Stencil_force_mpi = true;
-#else
-const bool Stencil_force_mpi = false;
-#endif
+extern bool Stencil_force_mpi ;

 class CartesianCommunicator : public SharedMemory {

@@ -384,6 +384,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 assert(ierr==0);
 list.push_back(xrq);
 off_node_bytes+=bytes;
+} else {
+// TODO : make a OMP loop on CPU, call threaded bcopy
+void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+assert(shm!=NULL);
+// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
 }

 if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -394,6 +400,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
+// std::cout << "Copy Synchronised\n"<<std::endl;
+acceleratorCopySynchronise();
+
 int nreq=list.size();

 if (nreq==0) return;
@@ -513,26 +513,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Each MPI rank should allocate our own buffer
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
-auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-auto zeContext= cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
-ze_device_mem_alloc_desc_t zeDesc = {};
-zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
-std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes
-<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-#else
 ShmCommBuf = acceleratorAllocDevice(bytes);
-#endif
 if (ShmCommBuf == (void *)NULL ) {
 std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
 exit(EXIT_FAILURE);
 }
-// if ( WorldRank == 0 ){
-if ( 1 ){
+if ( WorldRank == 0 ){
 std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
 << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
 }
-// SharedMemoryZero(ShmCommBuf,bytes);
+SharedMemoryZero(ShmCommBuf,bytes);
 std::cout<< "Setting up IPC"<<std::endl;
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Loop over ranks/gpu's on our node
@@ -543,21 +533,27 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 //////////////////////////////////////////////////
 // If it is me, pass around the IPC access key
 //////////////////////////////////////////////////
-void * thisBuf = ShmCommBuf;
+if(!Stencil_force_mpi) {
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
-ze_ipc_mem_handle_t handle;
+typedef struct { int fd; pid_t pid ; } clone_mem_t;
+
+auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
+auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
+
+ze_ipc_mem_handle_t ihandle;
+clone_mem_t handle;

 if ( r==WorldShmRank ) {
-auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle);
+auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
 if ( err != ZE_RESULT_SUCCESS ) {
-std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 exit(EXIT_FAILURE);
 } else {
-std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 }
-std::cerr<<"Allocated IpcHandle rank "<<r<<" (hex) ";
-for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
-std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
-}
-std::cerr<<std::endl;
+memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
+handle.pid = getpid();
 }
 #endif
 #ifdef GRID_CUDA
@@ -580,6 +576,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 }
 }
 #endif
+
 //////////////////////////////////////////////////
 // Share this IPC handle across the Shm Comm
 //////////////////////////////////////////////////
@@ -595,22 +592,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////
 // If I am not the source, overwrite thisBuf with remote buffer
 ///////////////////////////////////////////////////////////////
+void * thisBuf = ShmCommBuf;

 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
 if ( r!=WorldShmRank ) {
 thisBuf = nullptr;
-std::cerr<<"Using IpcHandle rank "<<r<<" ";
-for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
-std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
-}
-std::cerr<<std::endl;
-auto err = zeMemOpenIpcHandle(zeContext,zeDevice,handle,0,&thisBuf);
+std::cout<<"mapping seeking remote pid/fd "
+<<handle.pid<<"/"
+<<handle.fd<<std::endl;
+
+int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
+std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
+int myfd = syscall(438,pidfd,handle.fd,0);
+
+std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
+
+memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
+
+auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
 if ( err != ZE_RESULT_SUCCESS ) {
-std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
+std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 exit(EXIT_FAILURE);
 } else {
-std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
+std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
 }
 assert(thisBuf!=nullptr);
 }
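The commented-out `SYS_pidfd_getfd` line and the raw `syscall(438,...)` are the same call: 438 is the x86-64 syscall number for `pidfd_getfd`, spelled numerically where older glibc headers lack the constant. The mechanism is generic Linux fd passing: the exporting rank publishes its pid plus the fd backing the Level Zero IPC handle, and the importer clones that fd into its own descriptor table. A stripped-down sketch of just the fd-cloning step (assumes a kernel >= 5.6 and ptrace permission over the target process; error handling minimal):

```cpp
#include <sys/syscall.h>
#include <unistd.h>

// Clone file descriptor `remote_fd` of process `pid` into this process.
int clone_remote_fd(pid_t pid, int remote_fd) {
  int pidfd = syscall(SYS_pidfd_open, pid, 0);   // handle to the other process
  if (pidfd < 0) return -1;
  // 438 == SYS_pidfd_getfd on x86-64; numeric for old headers.
  int myfd = syscall(438, pidfd, remote_fd, 0);
  close(pidfd);
  return myfd;                                   // local fd aliasing the remote one
}
```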
@@ -636,6 +642,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////
 // Save a copy of the device buffers
 ///////////////////////////////////////////////////////////////
+}
 WorldShmCommBufs[r] = thisBuf;
 #else
 WorldShmCommBufs[r] = ShmCommBuf;
@@ -88,6 +88,13 @@ public:
 LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
 accessor.ViewClose();
 }
+
+// Helper function to print the state of this object in the AccCache
+void PrintCacheState(void)
+{
+MemoryManager::PrintState(this->_odata);
+}

 /////////////////////////////////////////////////////////////////////////////////
 // Return a view object that may be dereferenced in site loops.
 // The view is trivially copy constructible and may be copied to an accelerator device
@@ -42,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
 std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
 std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
 std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
-std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
 std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;

 if (warpSize != WARP_SIZE) {
@@ -52,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator

 // let the number of threads in a block be a multiple of 2, starting from warpSize
 threads = warpSize;
+if ( threads*sizeofsobj > sharedMemPerBlock ) {
+std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
+exit(EXIT_FAILURE);
+}
 while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
 // keep all the streaming multiprocessors busy
 blocks = nextPow2(multiProcessorCount);
@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
 });
 }

+template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
+{
+half.Checkerboard() = cb;
+autoView(half_v, half, AcceleratorWrite);
+autoView(full_v, full, AcceleratorRead);
+Coordinate rdim_full = full.Grid()->_rdimensions;
+Coordinate rdim_half = half.Grid()->_rdimensions;
+unsigned long ndim_half = half.Grid()->_ndimension;
+Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
+Coordinate ostride_half = half.Grid()->_ostride;
+accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
+
+Coordinate coor;
+int cbos;
+int linear=0;
+
+Lexicographic::CoorFromIndex(coor,ss,rdim_full);
+assert(coor.size()==ndim_half);
+
+for(int d=0;d<ndim_half;d++){
+if(checker_dim_mask_half[d]) linear += coor[d];
+}
+cbos = (linear&0x1);
+
+if (cbos==cb) {
+int ssh=0;
+for(int d=0;d<ndim_half;d++) {
+if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
+else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
+}
+coalescedWrite(half_v[ssh],full_v(ss));
+}
+});
+}
+template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
+{
+int cb = half.Checkerboard();
+autoView(half_v , half, AcceleratorRead);
+autoView(full_v , full, AcceleratorWrite);
+Coordinate rdim_full = full.Grid()->_rdimensions;
+Coordinate rdim_half = half.Grid()->_rdimensions;
+unsigned long ndim_half = half.Grid()->_ndimension;
+Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
+Coordinate ostride_half = half.Grid()->_ostride;
+accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
+
+Coordinate coor;
+int cbos;
+int linear=0;
+
+Lexicographic::CoorFromIndex(coor,ss,rdim_full);
+assert(coor.size()==ndim_half);
+
+for(int d=0;d<ndim_half;d++){
+if(checker_dim_mask_half[d]) linear += coor[d];
+}
+cbos = (linear&0x1);
+
+if (cbos==cb) {
+int ssh=0;
+for(int d=0;d<ndim_half;d++){
+if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
+else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
+}
+coalescedWrite(full_v[ss],half_v(ssh));
+}
+
+});
+}

 ////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
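Both kernels classify a full-lattice site by the standard red-black parity, restricted to the dimensions that are actually checkerboarded, and then map matching sites to the half lattice by halving the coordinate along the checkerboarded direction. In the notation of the code above:

```latex
\mathrm{cb}(x) \;=\; \Big(\sum_{d\,:\,\mathrm{mask}_d = 1} x_d\Big) \bmod 2,
\qquad
\mathrm{ssh}(x) \;=\; \sum_d \mathrm{ostride}_d \cdot
\begin{cases}
(x_d/2) \bmod r_d & d = \texttt{checker\_dim\_half} \\
x_d \bmod r_d & \text{otherwise}
\end{cases}
```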
@@ -576,6 +576,8 @@ class ScidacReader : public GridLimeReader {
 std::string rec_name(ILDG_BINARY_DATA);
 while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
 if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
+// in principle should do the line below, but that breaks backard compatibility with old data
+// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
 skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 return;
 }
Grid/qcd/action/fermion/CompactWilsonCloverFermion.h (new file, 240 lines)
@@ -0,0 +1,240 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h

Copyright (C) 2020 - 2022

Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#pragma once

#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>

NAMESPACE_BEGIN(Grid);

// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
//
// Modifications done here:
//
// Original: clover term = 12x12 matrix per site
//
// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
//
// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
//
// Words per site and improvement compared to original (combined with the input and output spinors):
//
// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
// - Minimal:  2*12 + 36    =  60 words -> 2.80 x less
// - Here:     2*12 + 42    =  66 words -> 2.55 x less
//
// These improvements directly translate to wall-clock time
//
// Data layout:
//
// - diagonal and triangle part as separate lattice fields,
//   this was faster than as 1 combined field on all tested machines
// - diagonal: as expected
// - triangle: store upper right triangle in row major order
// - graphical:
//        0  1  2  3  4
//           5  6  7  8
//              9 10 11 = upper right triangle indices
//                12 13
//                   14
//     0
//        1
//           2
//              3      = diagonal indices
//                 4
//                    5
//     0
//     1  5
//     2  6  9         = lower left triangle indices
//     3  7 10 12
//     4  8 11 13 14
//
// Impact on total memory consumption:
// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
// - Here:     (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts =  24 complex words per site
//           + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts =  60 complex words per site
//                                                                 =  84 complex words per site

template<class Impl>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
                                   public WilsonCloverHelpers<Impl>,
                                   public CompactWilsonCloverHelpers<Impl> {
  /////////////////////////////////////////////
  // Sizes
  /////////////////////////////////////////////

public:

  INHERIT_COMPACT_CLOVER_SIZES(Impl);

  /////////////////////////////////////////////
  // Type definitions
  /////////////////////////////////////////////

public:

  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);

  typedef WilsonFermion<Impl> WilsonBase;
  typedef WilsonCloverHelpers<Impl> Helpers;
  typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;

  /////////////////////////////////////////////
  // Constructors
  /////////////////////////////////////////////

public:

  CompactWilsonCloverFermion(GaugeField& _Umu,
                             GridCartesian& Fgrid,
                             GridRedBlackCartesian& Hgrid,
                             const RealD _mass,
                             const RealD _csw_r = 0.0,
                             const RealD _csw_t = 0.0,
                             const RealD _cF = 1.0,
                             const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
                             const ImplParams& impl_p = ImplParams());

  /////////////////////////////////////////////
  // Member functions (implementing interface)
  /////////////////////////////////////////////

public:

  virtual void Instantiatable() {};
  int ConstEE() override { return 0; };
  int isTrivialEE() override { return 0; };

  void Dhop(const FermionField& in, FermionField& out, int dag) override;

  void DhopOE(const FermionField& in, FermionField& out, int dag) override;

  void DhopEO(const FermionField& in, FermionField& out, int dag) override;

  void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;

  void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;

  void M(const FermionField& in, FermionField& out) override;

  void Mdag(const FermionField& in, FermionField& out) override;

  void Meooe(const FermionField& in, FermionField& out) override;

  void MeooeDag(const FermionField& in, FermionField& out) override;

  void Mooee(const FermionField& in, FermionField& out) override;

  void MooeeDag(const FermionField& in, FermionField& out) override;

  void MooeeInv(const FermionField& in, FermionField& out) override;

  void MooeeInvDag(const FermionField& in, FermionField& out) override;

  void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;

  void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;

  void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;

  void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;

  void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;

  /////////////////////////////////////////////
  // Member functions (internals)
  /////////////////////////////////////////////

  void MooeeInternal(const FermionField& in,
                     FermionField& out,
                     const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle);

  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////

  void ImportGauge(const GaugeField& _Umu) override;

  /////////////////////////////////////////////
  // Helpers
  /////////////////////////////////////////////

private:

  template<class Field>
  const MaskField* getCorrectMaskField(const Field &in) const {
    if(in.Grid()->_isCheckerBoarded) {
      if(in.Checkerboard() == Odd) {
        return &this->BoundaryMaskOdd;
      } else {
        return &this->BoundaryMaskEven;
      }
    } else {
      return &this->BoundaryMask;
    }
  }

  template<class Field>
  void ApplyBoundaryMask(Field& f) {
    const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
    assert(m != nullptr);
    CompactHelpers::ApplyBoundaryMask(f, *m);
  }

  /////////////////////////////////////////////
  // Member Data
  /////////////////////////////////////////////

public:

  RealD csw_r;
  RealD csw_t;
  RealD cF;

  bool open_boundaries;

  CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
  CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;

  CloverTriangleField Triangle, TriangleEven, TriangleOdd;
  CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;

  FermionField Tmp;

  MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};

NAMESPACE_END(Grid);
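The row-major upper-triangle layout described in the header comment maps an off-diagonal element (i,j), i < j, of a 6x6 Hermitian block to one of 15 packed slots; the lower triangle is recovered by conjugation. A small sketch of that index arithmetic (a hypothetical helper for illustration, not part of the class above):

```cpp
#include <cassert>
#include <complex>

constexpr int N = 6;  // block size: two 6x6 hermitian blocks per 12x12 clover term

// Packed row-major index of upper-triangle element (i,j), i < j.
constexpr int triangle_index(int i, int j) {
  return i * (2 * N - i - 1) / 2 + (j - i - 1);
}

// Reconstruct a full element from diagonal + packed triangle (conjugate below diagonal).
std::complex<double> element(int i, int j,
                             const std::complex<double>* diag,      // 6 entries
                             const std::complex<double>* triangle)  // 15 entries
{
  if (i == j) return diag[i];
  if (i < j)  return triangle[triangle_index(i, j)];
  return std::conj(triangle[triangle_index(j, i)]);
}

int main() {
  static_assert(triangle_index(0, 1) == 0 && triangle_index(4, 5) == 14,
                "matches the diagram in the header comment");
  return 0;
}
```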
@@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
 NAMESPACE_CHECK(WilsonTM);
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
+#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
 NAMESPACE_CHECK(WilsonClover);
 #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
 NAMESPACE_CHECK(Wilson5D);
@@ -153,6 +154,23 @@ typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;

+// Compact Clover fermions
+typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
+typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
+typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
+
+typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
+typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
+typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
+
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
+
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
+typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
+
 // Domain Wall fermions
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
@@ -4,10 +4,11 @@

 Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h

-Copyright (C) 2017
+Copyright (C) 2017 - 2022

 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 Author: David Preti <>
+Author: Daniel Richtmann <daniel.richtmann@gmail.com>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -29,7 +30,8 @@

 #pragma once

-#include <Grid/Grid.h>
+#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
+#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>

 NAMESPACE_BEGIN(Grid);

@@ -50,18 +52,15 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////

 template <class Impl>
-class WilsonCloverFermion : public WilsonFermion<Impl>
+class WilsonCloverFermion : public WilsonFermion<Impl>,
+                            public WilsonCloverHelpers<Impl>
 {
 public:
 // Types definitions
 INHERIT_IMPL_TYPES(Impl);
-template <typename vtype>
-using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
-typedef iImplClover<Simd> SiteCloverType;
-typedef Lattice<SiteCloverType> CloverFieldType;
+INHERIT_CLOVER_TYPES(Impl);

 public:
-typedef WilsonFermion<Impl> WilsonBase;
+typedef WilsonFermion<Impl>       WilsonBase;
+typedef WilsonCloverHelpers<Impl> Helpers;

 virtual int ConstEE(void) { return 0; };
 virtual void Instantiatable(void){};
@@ -72,42 +71,7 @@ public:
 const RealD _csw_r = 0.0,
 const RealD _csw_t = 0.0,
 const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
-const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
-Fgrid,
-Hgrid,
-_mass, impl_p, clover_anisotropy),
-CloverTerm(&Fgrid),
-CloverTermInv(&Fgrid),
-CloverTermEven(&Hgrid),
-CloverTermOdd(&Hgrid),
-CloverTermInvEven(&Hgrid),
-CloverTermInvOdd(&Hgrid),
-CloverTermDagEven(&Hgrid),
-CloverTermDagOdd(&Hgrid),
-CloverTermInvDagEven(&Hgrid),
-CloverTermInvDagOdd(&Hgrid)
-{
-assert(Nd == 4); // require 4 dimensions
-
-if (clover_anisotropy.isAnisotropic)
-{
-csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
-diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
-}
-else
-{
-csw_r = _csw_r * 0.5;
-diag_mass = 4.0 + _mass;
-}
-csw_t = _csw_t * 0.5;
-
-if (csw_r == 0)
-std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
-if (csw_t == 0)
-std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
-
-ImportGauge(_Umu);
-}
+const ImplParams &impl_p = ImplParams());

 virtual void M(const FermionField &in, FermionField &out);
 virtual void Mdag(const FermionField &in, FermionField &out);
@ -124,250 +88,21 @@ public:
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
|
||||
// Derivative parts unpreconditioned pseudofermions
|
||||
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
|
||||
{
|
||||
conformable(X.Grid(), Y.Grid());
|
||||
conformable(X.Grid(), force.Grid());
|
||||
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
|
||||
GaugeField clover_force(force.Grid());
|
||||
PropagatorField Lambda(force.Grid());
|
||||
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
|
||||
|
||||
// Guido: Here we are hitting some performance issues:
|
||||
// need to extract the components of the DoubledGaugeField
|
||||
// for each call
|
||||
// Possible solution
|
||||
// Create a vector object to store them? (cons: wasting space)
|
||||
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
|
||||
|
||||
Impl::extractLinkField(U, this->Umu);
|
||||
|
||||
force = Zero();
|
||||
// Derivative of the Wilson hopping term
|
||||
this->DhopDeriv(force, X, Y, dag);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Clover term derivative
|
||||
///////////////////////////////////////////////////////////
|
||||
Impl::outerProductImpl(Lambda, X, Y);
|
||||
//std::cout << "Lambda:" << Lambda << std::endl;
|
||||
|
||||
Gamma::Algebra sigma[] = {
|
||||
Gamma::Algebra::SigmaXY,
|
||||
Gamma::Algebra::SigmaXZ,
|
||||
Gamma::Algebra::SigmaXT,
|
||||
Gamma::Algebra::MinusSigmaXY,
|
||||
Gamma::Algebra::SigmaYZ,
|
||||
Gamma::Algebra::SigmaYT,
|
||||
Gamma::Algebra::MinusSigmaXZ,
|
||||
Gamma::Algebra::MinusSigmaYZ,
|
||||
Gamma::Algebra::SigmaZT,
|
||||
Gamma::Algebra::MinusSigmaXT,
|
||||
Gamma::Algebra::MinusSigmaYT,
|
||||
Gamma::Algebra::MinusSigmaZT};
|
||||
|
||||
/*
|
||||
sigma_{\mu \nu}=
|
||||
| 0 sigma[0] sigma[1] sigma[2] |
|
||||
| sigma[3] 0 sigma[4] sigma[5] |
|
||||
| sigma[6] sigma[7] 0 sigma[8] |
|
||||
| sigma[9] sigma[10] sigma[11] 0 |
|
||||
*/
|
||||
|
||||
int count = 0;
|
||||
clover_force = Zero();
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
force_mu = Zero();
|
||||
for (int nu = 0; nu < 4; nu++)
|
||||
{
|
||||
if (mu == nu)
|
||||
continue;
|
||||
|
||||
RealD factor;
|
||||
if (nu == 4 || mu == 4)
|
||||
{
|
||||
factor = 2.0 * csw_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
factor = 2.0 * csw_r;
|
||||
}
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
|
||||
count++;
|
||||
}
|
||||
|
||||
pokeLorentz(clover_force, U[mu] * force_mu, mu);
|
||||
}
|
||||
//clover_force *= csw;
|
||||
force += clover_force;
|
||||
}
|
||||
|
||||
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
|
||||
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
|
||||
{
|
||||
conformable(lambda.Grid(), U[0].Grid());
|
||||
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
|
||||
// insertion in upper staple
|
||||
// please check redundancy of shift operations
|
||||
|
||||
// C1+
|
||||
tmp = lambda * U[nu];
|
||||
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
|
||||
|
||||
// C2+
|
||||
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
|
||||
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
|
||||
|
||||
// C3+
|
||||
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
|
||||
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
|
||||
|
||||
// C4+
|
||||
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
|
||||
|
||||
// insertion in lower staple
|
||||
// C1-
|
||||
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
|
||||
// C2-
|
||||
tmp = adj(lambda) * U[nu];
|
||||
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
|
||||
// C3-
|
||||
tmp = lambda * U[nu];
|
||||
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
|
||||
|
||||
// C4-
|
||||
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
protected:
public:
  // currently fixed to 4 dimensions; could be made more general

  RealD csw_r;                                               // Clover coefficient - spatial
  RealD csw_t;                                               // Clover coefficient - temporal
  RealD diag_mass;                                           // Mass term
  CloverFieldType CloverTerm, CloverTermInv;                 // Clover term
  CloverFieldType CloverTermEven, CloverTermOdd;             // Clover term EO
  CloverFieldType CloverTermInvEven, CloverTermInvOdd;       // Clover term Inv EO
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12,
  // using the DeGrand-Rossi basis for the gamma matrices
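  // Note (assumes the standard DeGrand-Rossi representation, where
  // sigma_xy = gamma_x gamma_y = diag(-i, i, -i, i)): that diagonal is
  // exactly the pattern of factors written by fillCloverXY below; the other
  // five fill routines encode the corresponding sigma_{mu,nu} the same way.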
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(), 1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });

    return T;
  }

  CloverFieldType fillCloverXZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(), 1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) =  F_v[i]()();
      T_v[i]()(2, 3) = -F_v[i]()();
      T_v[i]()(3, 2) =  F_v[i]()();
    });

    return T;
  }

  CloverFieldType fillCloverXY(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(), 1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });

    return T;
  }

  CloverFieldType fillCloverXT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(), 1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
      T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
    });

    return T;
  }

  CloverFieldType fillCloverYT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(), 1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) =  (F_v[i]()());
      T_v[i]()(2, 3) =  (F_v[i]()());
      T_v[i]()(3, 2) = -(F_v[i]()());
    });

    return T;
  }

  CloverFieldType fillCloverZT(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());

    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, CloverTerm.Grid()->oSites(), 1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
      T_v[i]()(3, 3) = timesI(F_v[i]()());
    });

    return T;
  }
};
NAMESPACE_END(Grid);

Grid/qcd/action/fermion/WilsonCloverHelpers.h (new file, 761 lines)
@ -0,0 +1,761 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h

    Copyright (C) 2021 - 2022

    Author: Daniel Richtmann <daniel.richtmann@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#pragma once

// Helper routines that implement common clover functionality

NAMESPACE_BEGIN(Grid);
template<class Impl> class WilsonCloverHelpers {
public:

  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);

  // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
  static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
  {
    conformable(lambda.Grid(), U[0].Grid());
    GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
    // insertion in upper staple
    // please check redundancy of shift operations

    // C1+
    tmp = lambda * U[nu];
    out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);

    // C2+
    tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);

    // C3+
    tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);

    // C4+
    out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;

    // insertion in lower staple
    // C1-
    out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);

    // C2-
    tmp = adj(lambda) * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);

    // C3-
    tmp = lambda * U[nu];
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);

    // C4-
    out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;

    return out;
  }

  static CloverField fillCloverYZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();
    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });

    return T;
  }

  static CloverField fillCloverXZ(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
    });

    return T;
  }

  static CloverField fillCloverXY(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });

    return T;
  }

  static CloverField fillCloverXT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
    });

    return T;
  }

  static CloverField fillCloverYT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());
    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
    });

    return T;
  }

  static CloverField fillCloverZT(const GaugeLinkField &F)
  {
    CloverField T(F.Grid());

    T = Zero();

    autoView(T_v, T, AcceleratorWrite);
    autoView(F_v, F, AcceleratorRead);
    accelerator_for(i, T.Grid()->oSites(), CloverField::vector_type::Nsimd(),
    {
      coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
      coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
      coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
    });

    return T;
  }

  template<class _Spinor>
  static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
    auto CC = coalescedRead(C);
    mult(&phi, &CC, &chi);
  }

  template<class _SpinorField>
  inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView(out_v, out, AcceleratorWrite);
    autoView(phi_v, phi, AcceleratorRead);
    autoView(C_v, C, AcceleratorRead);
    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
    accelerator_for(sss, out.Grid()->oSites(), Nsimd, {
      calcSpinor tmp;
      multClover(tmp, C_v[sss], phi_v(sss));
      coalescedWrite(out_v[sss], tmp);
    });
  }
};

template<class Impl> class CompactWilsonCloverHelpers {
public:

  INHERIT_COMPACT_CLOVER_SIZES(Impl);

  INHERIT_IMPL_TYPES(Impl);
  INHERIT_CLOVER_TYPES(Impl);
  INHERIT_COMPACT_CLOVER_TYPES(Impl);

#if 0
  static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
#else
  template<typename vobj>
  static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
    assert(i != j);
    if(i < j) {
      return triangle()(block)(triangle_index(i, j));
    } else { // i > j
      return conjugate(triangle()(block)(triangle_index(i, j)));
    }
  }
#endif

  static accelerator_inline int triangle_index(int i, int j) {
    if(i == j)
      return 0;
    else if(i < j)
      return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
    else // i > j
      return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
  }
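  // Illustrative check (editorial, not part of Grid): for Nred = 6 the packed
  // upper triangle has 15 entries per block, indexed row by row:
  //   (0,1)->0  (0,2)->1  (0,3)->2  (0,4)->3  (0,5)->4
  //   (1,2)->5  (1,3)->6  (1,4)->7  (1,5)->8
  //   (2,3)->9  (2,4)->10 (2,5)->11
  //   (3,4)->12 (3,5)->13
  //   (4,5)->14
  // i > j maps to the same slot as (j, i) and is conjugated on read by
  // triangle_elem, exploiting hermiticity of the blocks.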
  static void MooeeKernel_gpu(int Nsite,
                              int Ls,
                              const FermionField& in,
                              FermionField& out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(in_v, in, AcceleratorRead);
    autoView(out_v, out, AcceleratorWrite);

    typedef decltype(coalescedRead(out_v[0])) CalcSpinor;

    const uint64_t NN = Nsite * Ls;

    accelerator_for(ss, NN, Simd::Nsimd(), {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v(sF);
      auto diagonal_t = diagonal_v(sU);
      auto triangle_t = triangle_v(sU);
      for(int block=0; block<Nhs; block++) {
        int s_start = block*Nhs;
        for(int i=0; i<Nred; i++) {
          int si = s_start + i/Nc, ci = i%Nc;
          res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
          for(int j=0; j<Nred; j++) {
            if (j == i) continue;
            int sj = s_start + j/Nc, cj = j%Nc;
            res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
          };
        };
      };
      coalescedWrite(out_v[sF], res);
    });
  }
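  // In formulae (editorial summary): per chirality block b = 0,1 and per site,
  // the kernel above applies a Hermitian 6x6 matrix to the half-spinor
  // components,
  //   (M_ee psi)^b_i = D^b_i psi^b_i + sum_{j != i} T^b_{ij} psi^b_j,  i,j = 0..Nred-1,
  // with T^b_{ji} = conj(T^b_{ij}) reconstructed on the fly by triangle_elem.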
  static void MooeeKernel_cpu(int Nsite,
                              int Ls,
                              const FermionField& in,
                              FermionField& out,
                              const CloverDiagonalField& diagonal,
                              const CloverTriangleField& triangle) {
    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(in_v, in, CpuRead);
    autoView(out_v, out, CpuWrite);

    typedef SiteSpinor CalcSpinor;

#if defined(A64FX) || defined(A64FXFIXEDSIZE)
#define PREFETCH_CLOVER(BASE) {                                     \
    uint64_t base;                                                  \
    int pf_dist_L1 = 1;                                             \
    int pf_dist_L2 = -5; /* -> penalty -> disable */                \
                                                                    \
    if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
    }                                                               \
                                                                    \
    if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {           \
      base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);               \
      svprfd(svptrue_b64(), (int64_t*)(base +    0), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  256), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  512), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base +  768), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
      svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
    }                                                               \
  }
// TODO: Implement/generalize this for other architectures
// I played around a bit on KNL (see below) but it didn't bring any benefit
// #elif defined(AVX512)
// #define PREFETCH_CLOVER(BASE) {                               \
//   uint64_t base;                                              \
//   int pf_dist_L1 = 1;                                         \
//   int pf_dist_L2 = +4;                                        \
//                                                               \
//   if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) {       \
//     base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0);           \
//     _mm_prefetch((const char*)(base +   0), _MM_HINT_T0);     \
//     _mm_prefetch((const char*)(base +  64), _MM_HINT_T0);     \
//     _mm_prefetch((const char*)(base + 128), _MM_HINT_T0);     \
//     _mm_prefetch((const char*)(base + 192), _MM_HINT_T0);     \
//     _mm_prefetch((const char*)(base + 256), _MM_HINT_T0);     \
//     _mm_prefetch((const char*)(base + 320), _MM_HINT_T0);     \
//   }                                                           \
//                                                               \
//   if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) {       \
//     base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0);           \
//     _mm_prefetch((const char*)(base +   0), _MM_HINT_T1);     \
//     _mm_prefetch((const char*)(base +  64), _MM_HINT_T1);     \
//     _mm_prefetch((const char*)(base + 128), _MM_HINT_T1);     \
//     _mm_prefetch((const char*)(base + 192), _MM_HINT_T1);     \
//     _mm_prefetch((const char*)(base + 256), _MM_HINT_T1);     \
//     _mm_prefetch((const char*)(base + 320), _MM_HINT_T1);     \
//   }                                                           \
// }
#else
#define PREFETCH_CLOVER(BASE)
#endif

    const uint64_t NN = Nsite * Ls;

    thread_for(ss, NN, {
      int sF = ss;
      int sU = ss/Ls;
      CalcSpinor res;
      CalcSpinor in_t = in_v[sF];
      auto diag_t     = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
      auto triangle_t = triangle_v[sU];

      // upper half
      PREFETCH_CLOVER(0);

      auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
      auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
      auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
      auto in_cc_1_0 = conjugate(in_t()(1)(0));
      auto in_cc_1_1 = conjugate(in_t()(1)(1));

      res()(0)(0) =           diag_t()(0)( 0) * in_t()(0)(0)
                  +       triangle_t()(0)( 0) * in_t()(0)(1)
                  +       triangle_t()(0)( 1) * in_t()(0)(2)
                  +       triangle_t()(0)( 2) * in_t()(1)(0)
                  +       triangle_t()(0)( 3) * in_t()(1)(1)
                  +       triangle_t()(0)( 4) * in_t()(1)(2);

      res()(0)(1) =       triangle_t()(0)( 0) * in_cc_0_0;
      res()(0)(1) =           diag_t()(0)( 1) * in_t()(0)(1)
                  +       triangle_t()(0)( 5) * in_t()(0)(2)
                  +       triangle_t()(0)( 6) * in_t()(1)(0)
                  +       triangle_t()(0)( 7) * in_t()(1)(1)
                  +       triangle_t()(0)( 8) * in_t()(1)(2)
                  + conjugate(   res()(0)( 1));

      res()(0)(2) =       triangle_t()(0)( 1) * in_cc_0_0
                  +       triangle_t()(0)( 5) * in_cc_0_1;
      res()(0)(2) =           diag_t()(0)( 2) * in_t()(0)(2)
                  +       triangle_t()(0)( 9) * in_t()(1)(0)
                  +       triangle_t()(0)(10) * in_t()(1)(1)
                  +       triangle_t()(0)(11) * in_t()(1)(2)
                  + conjugate(   res()(0)( 2));

      res()(1)(0) =       triangle_t()(0)( 2) * in_cc_0_0
                  +       triangle_t()(0)( 6) * in_cc_0_1
                  +       triangle_t()(0)( 9) * in_cc_0_2;
      res()(1)(0) =           diag_t()(0)( 3) * in_t()(1)(0)
                  +       triangle_t()(0)(12) * in_t()(1)(1)
                  +       triangle_t()(0)(13) * in_t()(1)(2)
                  + conjugate(   res()(1)( 0));

      res()(1)(1) =       triangle_t()(0)( 3) * in_cc_0_0
                  +       triangle_t()(0)( 7) * in_cc_0_1
                  +       triangle_t()(0)(10) * in_cc_0_2
                  +       triangle_t()(0)(12) * in_cc_1_0;
      res()(1)(1) =           diag_t()(0)( 4) * in_t()(1)(1)
                  +       triangle_t()(0)(14) * in_t()(1)(2)
                  + conjugate(   res()(1)( 1));

      res()(1)(2) =       triangle_t()(0)( 4) * in_cc_0_0
                  +       triangle_t()(0)( 8) * in_cc_0_1
                  +       triangle_t()(0)(11) * in_cc_0_2
                  +       triangle_t()(0)(13) * in_cc_1_0
                  +       triangle_t()(0)(14) * in_cc_1_1;
      res()(1)(2) =           diag_t()(0)( 5) * in_t()(1)(2)
                  + conjugate(   res()(1)( 2));

      vstream(out_v[sF]()(0)(0), res()(0)(0));
      vstream(out_v[sF]()(0)(1), res()(0)(1));
      vstream(out_v[sF]()(0)(2), res()(0)(2));
      vstream(out_v[sF]()(1)(0), res()(1)(0));
      vstream(out_v[sF]()(1)(1), res()(1)(1));
      vstream(out_v[sF]()(1)(2), res()(1)(2));

      // lower half
      PREFETCH_CLOVER(1);

      auto in_cc_2_0 = conjugate(in_t()(2)(0));
      auto in_cc_2_1 = conjugate(in_t()(2)(1));
      auto in_cc_2_2 = conjugate(in_t()(2)(2));
      auto in_cc_3_0 = conjugate(in_t()(3)(0));
      auto in_cc_3_1 = conjugate(in_t()(3)(1));

      res()(2)(0) =           diag_t()(1)( 0) * in_t()(2)(0)
                  +       triangle_t()(1)( 0) * in_t()(2)(1)
                  +       triangle_t()(1)( 1) * in_t()(2)(2)
                  +       triangle_t()(1)( 2) * in_t()(3)(0)
                  +       triangle_t()(1)( 3) * in_t()(3)(1)
                  +       triangle_t()(1)( 4) * in_t()(3)(2);

      res()(2)(1) =       triangle_t()(1)( 0) * in_cc_2_0;
      res()(2)(1) =           diag_t()(1)( 1) * in_t()(2)(1)
                  +       triangle_t()(1)( 5) * in_t()(2)(2)
                  +       triangle_t()(1)( 6) * in_t()(3)(0)
                  +       triangle_t()(1)( 7) * in_t()(3)(1)
                  +       triangle_t()(1)( 8) * in_t()(3)(2)
                  + conjugate(   res()(2)( 1));

      res()(2)(2) =       triangle_t()(1)( 1) * in_cc_2_0
                  +       triangle_t()(1)( 5) * in_cc_2_1;
      res()(2)(2) =           diag_t()(1)( 2) * in_t()(2)(2)
                  +       triangle_t()(1)( 9) * in_t()(3)(0)
                  +       triangle_t()(1)(10) * in_t()(3)(1)
                  +       triangle_t()(1)(11) * in_t()(3)(2)
                  + conjugate(   res()(2)( 2));

      res()(3)(0) =       triangle_t()(1)( 2) * in_cc_2_0
                  +       triangle_t()(1)( 6) * in_cc_2_1
                  +       triangle_t()(1)( 9) * in_cc_2_2;
      res()(3)(0) =           diag_t()(1)( 3) * in_t()(3)(0)
                  +       triangle_t()(1)(12) * in_t()(3)(1)
                  +       triangle_t()(1)(13) * in_t()(3)(2)
                  + conjugate(   res()(3)( 0));

      res()(3)(1) =       triangle_t()(1)( 3) * in_cc_2_0
                  +       triangle_t()(1)( 7) * in_cc_2_1
                  +       triangle_t()(1)(10) * in_cc_2_2
                  +       triangle_t()(1)(12) * in_cc_3_0;
      res()(3)(1) =           diag_t()(1)( 4) * in_t()(3)(1)
                  +       triangle_t()(1)(14) * in_t()(3)(2)
                  + conjugate(   res()(3)( 1));

      res()(3)(2) =       triangle_t()(1)( 4) * in_cc_2_0
                  +       triangle_t()(1)( 8) * in_cc_2_1
                  +       triangle_t()(1)(11) * in_cc_2_2
                  +       triangle_t()(1)(13) * in_cc_3_0
                  +       triangle_t()(1)(14) * in_cc_3_1;
      res()(3)(2) =           diag_t()(1)( 5) * in_t()(3)(2)
                  + conjugate(   res()(3)( 2));

      vstream(out_v[sF]()(2)(0), res()(2)(0));
      vstream(out_v[sF]()(2)(1), res()(2)(1));
      vstream(out_v[sF]()(2)(2), res()(2)(2));
      vstream(out_v[sF]()(3)(0), res()(3)(0));
      vstream(out_v[sF]()(3)(1), res()(3)(1));
      vstream(out_v[sF]()(3)(2), res()(3)(2));
    });
  }

  static void MooeeKernel(int Nsite,
                          int Ls,
                          const FermionField& in,
                          FermionField& out,
                          const CloverDiagonalField& diagonal,
                          const CloverTriangleField& triangle) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
    MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
#else
    MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
#endif
  }
  static void Invert(const CloverDiagonalField& diagonal,
                     const CloverTriangleField& triangle,
                     CloverDiagonalField&       diagonalInv,
                     CloverTriangleField&       triangleInv) {
    conformable(diagonal, diagonalInv);
    conformable(triangle, triangleInv);
    conformable(diagonal, triangle);

    diagonalInv.Checkerboard() = diagonal.Checkerboard();
    triangleInv.Checkerboard() = triangle.Checkerboard();

    GridBase* grid = diagonal.Grid();

    long lsites = grid->lSites();

    typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
    typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;

    autoView(diagonal_v, diagonal, CpuRead);
    autoView(triangle_v, triangle, CpuRead);
    autoView(diagonalInv_v, diagonalInv, CpuWrite);
    autoView(triangleInv_v, triangleInv, CpuWrite);

    thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
      Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
      Eigen::MatrixXcd clover_eigen     = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);

      scalar_object_diagonal diagonal_tmp     = Zero();
      scalar_object_diagonal diagonal_inv_tmp = Zero();
      scalar_object_triangle triangle_tmp     = Zero();
      scalar_object_triangle triangle_inv_tmp = Zero();

      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(site, lcoor);

      peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
      peekLocalSite(triangle_tmp, triangle_v, lcoor);

      // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
              else
                clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
            }
          }
        }
      }

      clover_inv_eigen = clover_eigen.inverse();

      for (long s_row=0;s_row<Ns;s_row++) {
        for (long s_col=0;s_col<Ns;s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for (long c_row=0;c_row<Nc;c_row++) {
            for (long c_col=0;c_col<Nc;c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else if(i < j)
                triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
              else
                continue;
            }
          }
        }
      }

      pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
      pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
    });
  }
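  // Sketch of the TODO above (editorial, untested): the 12x12 matrix assembled
  // in clover_eigen is block diagonal in the two chirality halves, so the two
  // 6x6 Hermitian blocks could be inverted independently, e.g. with Eigen:
  //   clover_inv_eigen.block(0, 0, 6, 6) = clover_eigen.block(0, 0, 6, 6).inverse();
  //   clover_inv_eigen.block(6, 6, 6, 6) = clover_eigen.block(6, 6, 6, 6).inverse();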
  static void ConvertLayout(const CloverField& full,
                            CloverDiagonalField& diagonal,
                            CloverTriangleField& triangle) {
    conformable(full, diagonal);
    conformable(full, triangle);

    diagonal.Checkerboard() = full.Checkerboard();
    triangle.Checkerboard() = full.Checkerboard();

    autoView(full_v, full, AcceleratorRead);
    autoView(diagonal_v, diagonal, AcceleratorWrite);
    autoView(triangle_v, triangle, AcceleratorWrite);

    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else if(i < j)
                triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
              else
                continue;
            }
          }
        }
      }
    });
  }


  static void ConvertLayout(const CloverDiagonalField& diagonal,
                            const CloverTriangleField& triangle,
                            CloverField& full) {
    conformable(full, diagonal);
    conformable(full, triangle);

    full.Checkerboard() = diagonal.Checkerboard();

    full = Zero();

    autoView(diagonal_v, diagonal, AcceleratorRead);
    autoView(triangle_v, triangle, AcceleratorRead);
    autoView(full_v, full, AcceleratorWrite);

    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, full.Grid()->oSites(), 1, {
      for(int s_row = 0; s_row < Ns; s_row++) {
        for(int s_col = 0; s_col < Ns; s_col++) {
          if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
          int block       = s_row / Nhs;
          int s_row_block = s_row % Nhs;
          int s_col_block = s_col % Nhs;
          for(int c_row = 0; c_row < Nc; c_row++) {
            for(int c_col = 0; c_col < Nc; c_col++) {
              int i = s_row_block * Nc + c_row;
              int j = s_col_block * Nc + c_col;
              if(i == j)
                full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
              else
                full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
            }
          }
        }
      }
    });
  }
  static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
    // Checks/grid
    double t0 = usecond();
    conformable(diagonal, triangle);
    GridBase* grid = diagonal.Grid();

    // Determine the boundary coordinates/sites
    double t1 = usecond();
    int t_dir = Nd - 1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];

    // Set off-diagonal parts at boundary to zero -- OK
    double t2 = usecond();
    CloverTriangleField zeroTriangle(grid);
    zeroTriangle.Checkerboard() = triangle.Checkerboard();
    zeroTriangle = Zero();
    triangle = where(t_coor == 0,   zeroTriangle, triangle);
    triangle = where(t_coor == T-1, zeroTriangle, triangle);

    // Set diagonal to unity (scaled correctly) -- OK
    double t3 = usecond();
    CloverDiagonalField tmp(grid);
    tmp.Checkerboard() = diagonal.Checkerboard();
    tmp = -1.0 * csw_t + diag_mass;
    diagonal = where(t_coor == 0,   tmp, diagonal);
    diagonal = where(t_coor == T-1, tmp, diagonal);

    // Correct values next to boundary
    double t4 = usecond();
    if(cF != 1.0) {
      tmp = cF - 1.0;
      tmp += diagonal;
      diagonal = where(t_coor == 1,   tmp, diagonal);
      diagonal = where(t_coor == T-2, tmp, diagonal);
    }

    // Report timings
    double t5 = usecond();
#if 0
    std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
              << " checks = "          << (t1 - t0) / 1e6
              << ", coordinate = "     << (t2 - t1) / 1e6
              << ", off-diag zero = "  << (t3 - t2) / 1e6
              << ", diagonal unity = " << (t4 - t3) / 1e6
              << ", near-boundary = "  << (t5 - t4) / 1e6
              << ", total = "          << (t5 - t0) / 1e6
              << std::endl;
#endif
  }
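  // In summary (editorial note): on the boundary timeslices t = 0 and t = T-1
  // the clover term is replaced by (diag_mass - csw_t) * 1 with vanishing
  // off-diagonal part, and for cF != 1 the diagonal on t = 1 and t = T-2 is
  // shifted by (cF - 1), presumably implementing the boundary improvement
  // coefficient c_F for open temporal boundary conditions.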
  template<class Field, class Mask>
  static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
    conformable(f, m);
    auto grid = f.Grid();
    const uint32_t Nsite = grid->oSites();
    const uint32_t Nsimd = grid->Nsimd();
    autoView(f_v, f, AcceleratorWrite);
    autoView(m_v, m, AcceleratorRead);
    // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
    accelerator_for(ss, Nsite, Nsimd, {
      coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
    });
  }

  template<class MaskField>
  static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
    assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
    assert(odd.Grid()->_isCheckerBoarded  && odd.Checkerboard() == Odd);
    assert(!full.Grid()->_isCheckerBoarded);

    GridBase* grid = full.Grid();
    int t_dir = Nd-1;
    Lattice<iScalar<vInteger>> t_coor(grid);
    LatticeCoordinate(t_coor, t_dir);
    int T = grid->GlobalDimensions()[t_dir];

    MaskField zeroMask(grid); zeroMask = Zero();
    full = 1.0;
    full = where(t_coor == 0,   zeroMask, full);
    full = where(t_coor == T-1, zeroMask, full);

    pickCheckerboard(Even, even, full);
    pickCheckerboard(Odd,  odd,  full);
  }
};

NAMESPACE_END(Grid);

Grid/qcd/action/fermion/WilsonCloverTypes.h (new file, 92 lines)
@ -0,0 +1,92 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h

    Copyright (C) 2021 - 2022

    Author: Daniel Richtmann <daniel.richtmann@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#pragma once

NAMESPACE_BEGIN(Grid);
template<class Impl>
class WilsonCloverTypes {
public:
  INHERIT_IMPL_TYPES(Impl);

  template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;

  typedef iImplClover<Simd> SiteClover;

  typedef Lattice<SiteClover> CloverField;
};

template<class Impl>
class CompactWilsonCloverTypes {
public:
  INHERIT_IMPL_TYPES(Impl);

  static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");

  static constexpr int Nred      = Nc * Nhs;        // 6
  static constexpr int Nblock    = Nhs;             // 2
  static constexpr int Ndiagonal = Nred;            // 6
  static constexpr int Ntriangle = (Nred - 1) * Nc; // 15

  template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
  template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;

  typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
  typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
  typedef iSinglet<Simd>            SiteMask;

  typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
  typedef Lattice<SiteCloverTriangle> CloverTriangleField;
  typedef Lattice<SiteMask>           MaskField;
};
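// Storage note (editorial): the full SiteClover holds (Ns*Nc)^2 = 144 complex
// numbers per site, while the compact layout holds Nblock*(Ndiagonal+Ntriangle)
// = 2*(6+15) = 42, i.e. roughly a 3.4x reduction, exploiting that the clover
// term is block diagonal in chirality with Hermitian 6x6 blocks.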
#define INHERIT_CLOVER_TYPES(Impl)                                    \
  typedef typename WilsonCloverTypes<Impl>::SiteClover  SiteClover;   \
  typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;

#define INHERIT_COMPACT_CLOVER_TYPES(Impl)                                                  \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal  SiteCloverDiagonal;  \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle  SiteCloverTriangle;  \
  typedef typename CompactWilsonCloverTypes<Impl>::SiteMask            SiteMask;            \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
  typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
  typedef typename CompactWilsonCloverTypes<Impl>::MaskField           MaskField;           \
  /* ugly duplication but needed inside functionality classes */                            \
  template<typename vtype> using iImplCloverDiagonal =                                      \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
  template<typename vtype> using iImplCloverTriangle =                                      \
    iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;

#define INHERIT_COMPACT_CLOVER_SIZES(Impl)                                    \
  static constexpr int Nred      = CompactWilsonCloverTypes<Impl>::Nred;      \
  static constexpr int Nblock    = CompactWilsonCloverTypes<Impl>::Nblock;    \
  static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
  static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;

NAMESPACE_END(Grid);

@ -834,6 +834,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

#if (!defined(GRID_HIP))
  int tshift = (mu == Nd-1) ? 1 : 0;
  unsigned int LLt = GridDefaultLatt()[Tp];
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
@ -886,7 +887,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
  RealD sign = 1; // sign flip for vector/tadpole
  RealD sign = 1.0; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
@ -896,7 +897,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    auto b=this->_b;
    auto c=this->_c;
    if ( b == 1 && c == 0 ) {
      sign = -1;
      sign = -1.0;
    }
    else {
      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@ -940,7 +941,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
    tmp = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
    tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
    // Mask the time
    if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift wraps past the last timeslice
      unsigned int t0 = 0;
      tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
    } else {
      tmp = where((lcoor>=tmin+tshift),tmp,zz);
    }
    L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated

    InsertSlice(L_Q, q_out, s , 0);
Grid/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h (new file, 363 lines)
@ -0,0 +1,363 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h

    Copyright (C) 2017 - 2022

    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    Author: Daniel Richtmann <daniel.richtmann@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>

NAMESPACE_BEGIN(Grid);

template<class Impl>
CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
                                                             GridCartesian& Fgrid,
                                                             GridRedBlackCartesian& Hgrid,
                                                             const RealD _mass,
                                                             const RealD _csw_r,
                                                             const RealD _csw_t,
                                                             const RealD _cF,
                                                             const WilsonAnisotropyCoefficients& clover_anisotropy,
                                                             const ImplParams& impl_p)
  : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
  , csw_r(_csw_r)
  , csw_t(_csw_t)
  , cF(_cF)
  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
  , Diagonal(&Fgrid),        Triangle(&Fgrid)
  , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
  , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
  , DiagonalInv(&Fgrid),     TriangleInv(&Fgrid)
  , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
  , DiagonalInvOdd(&Hgrid),  TriangleInvOdd(&Hgrid)
  , Tmp(&Fgrid)
  , BoundaryMask(&Fgrid)
  , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
  csw_r *= 0.5;
  csw_t *= 0.5;
  if (clover_anisotropy.isAnisotropic)
    csw_r /= clover_anisotropy.xi_0;

  ImportGauge(_Umu);
  if (open_boundaries)
    CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::Dhop(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopOE(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
  WilsonBase::DhopEO(in, out, dag);
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
  WilsonBase::DhopDir(in, out, dir, disp);
  if(this->open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
  WilsonBase::DhopDirAll(in, out);
  if(this->open_boundaries) {
    for(auto& o : out) ApplyBoundaryMask(o);
  }
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
  Mooee(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
  out.Checkerboard() = in.Checkerboard();
  WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
  MooeeDag(in, Tmp);
  axpy(out, 1.0, out, Tmp);
  if(open_boundaries) ApplyBoundaryMask(out);
}
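// Note (editorial summary): with open boundaries both M and Mdag thus act as
// B * (Dhop + Mooee), where B is the idempotent boundary mask; the mask is
// applied once at the end rather than after each partial term.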

template<class Impl>
void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
  WilsonBase::Meooe(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
  WilsonBase::MeooeDag(in, out);
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
    } else {
      MooeeInternal(in, out, DiagonalEven, TriangleEven);
    }
  } else {
    MooeeInternal(in, out, Diagonal, Triangle);
  }
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
  Mooee(in, out); // blocks are hermitian
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
  if(in.Grid()->_isCheckerBoarded) {
    if(in.Checkerboard() == Odd) {
      MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
    } else {
      MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
    }
  } else {
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  }
  if(open_boundaries) ApplyBoundaryMask(out);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
  MooeeInv(in, out); // blocks are hermitian
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
  DhopDir(in, out, dir, disp);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
  DhopDirAll(in, out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!open_boundaries); // TODO check for changes required for open bc

  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());

  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());

  Impl::extractLinkField(U, this->Umu);

  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);

  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;

  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};

  /*
    sigma_{\mu \nu} =
    |        0  sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]         0  sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]         0  sigma[8] |
    | sigma[9] sigma[10] sigma[11]         0 |
  */

  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;

      RealD factor;
      if (nu == 3 || mu == 3) // temporal direction is Nd-1 = 3; the original test against 4 could never fire
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor * Helpers::Cmunu(U, lambda, mu, nu); // checked
      count++;
    }

    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
  assert(0);
}

template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField& in,
                                                     FermionField& out,
                                                     const CloverDiagonalField& diagonal,
                                                     const CloverTriangleField& triangle) {
  assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
  out.Checkerboard() = in.Checkerboard();
  conformable(in, out);
  conformable(in, diagonal);
  conformable(in, triangle);

  CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
  // NOTE: parts copied from original implementation

  // Import gauge into base class
  double t0 = usecond();
  WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that

  // Initialize temporary variables
  double t1 = usecond();
  conformable(_Umu.Grid(), this->GaugeGrid());
  GridBase* grid = _Umu.Grid();
  typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
  CloverField TmpOriginal(grid);

  // Compute the field strength terms mu>nu
  double t2 = usecond();
  WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
  WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
  WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);

  // Compute the Clover Operator acting on Colour and Spin
  // multiply here by the clover coefficients for the anisotropy
  double t3 = usecond();
  TmpOriginal  = Helpers::fillCloverYZ(Bx) * csw_r;
  TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
  TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
  TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
  TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
  TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
  TmpOriginal += this->diag_mass;
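  // Schematically (editorial note; the factors of 1/2 are already folded into
  // csw_r and csw_t in the constructor): the sum above assembles
  //   TmpOriginal = sum_{mu<nu} csw_{mu,nu} sigma_{mu,nu} F_{mu,nu} + diag_mass,
  // with csw_{mu,nu} = csw_r for spatial pairs and csw_t for temporal ones.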
  // Convert the data layout of the clover term
  double t4 = usecond();
  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);

  // Possibly modify the boundary values
  double t5 = usecond();
  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);

  // Invert the clover term in the improved layout
  double t6 = usecond();
  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);

  // Fill the remaining clover fields
  double t7 = usecond();
  pickCheckerboard(Even, DiagonalEven, Diagonal);
  pickCheckerboard(Even, TriangleEven, Triangle);
  pickCheckerboard(Odd,  DiagonalOdd,  Diagonal);
  pickCheckerboard(Odd,  TriangleOdd,  Triangle);
  pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
  pickCheckerboard(Even, TriangleInvEven, TriangleInv);
  pickCheckerboard(Odd,  DiagonalInvOdd,  DiagonalInv);
  pickCheckerboard(Odd,  TriangleInvOdd,  TriangleInv);

  // Report timings
  double t8 = usecond();
#if 0
  std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
            << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
            << ", allocations = "              << (t2 - t1) / 1e6
            << ", field strength = "           << (t3 - t2) / 1e6
            << ", fill clover = "              << (t4 - t3) / 1e6
            << ", convert = "                  << (t5 - t4) / 1e6
            << ", boundaries = "               << (t6 - t5) / 1e6
            << ", inversions = "               << (t7 - t6) / 1e6
            << ", pick cbs = "                 << (t8 - t7) / 1e6
            << ", total = "                    << (t8 - t0) / 1e6
            << std::endl;
#endif
}

NAMESPACE_END(Grid);
@ -2,12 +2,13 @@
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
|
||||
|
||||
Copyright (C) 2017
|
||||
Copyright (C) 2017 - 2022
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@ -33,6 +34,45 @@
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField& _Umu,
|
||||
GridCartesian& Fgrid,
|
||||
GridRedBlackCartesian& Hgrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r,
|
||||
const RealD _csw_t,
|
||||
const WilsonAnisotropyCoefficients& clover_anisotropy,
|
||||
const ImplParams& impl_p)
|
||||
: WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
|
||||
, CloverTerm(&Fgrid)
|
||||
, CloverTermInv(&Fgrid)
|
||||
, CloverTermEven(&Hgrid)
|
||||
, CloverTermOdd(&Hgrid)
|
||||
, CloverTermInvEven(&Hgrid)
|
||||
, CloverTermInvOdd(&Hgrid)
|
||||
, CloverTermDagEven(&Hgrid)
|
||||
, CloverTermDagOdd(&Hgrid)
|
||||
, CloverTermInvDagEven(&Hgrid)
|
||||
, CloverTermInvDagOdd(&Hgrid) {
|
||||
assert(Nd == 4); // require 4 dimensions
|
||||
|
||||
if(clover_anisotropy.isAnisotropic) {
|
||||
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
|
||||
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
|
||||
} else {
|
||||
csw_r = _csw_r * 0.5;
|
||||
diag_mass = 4.0 + _mass;
|
||||
}
|
||||
csw_t = _csw_t * 0.5;
|
||||
|
||||
if(csw_r == 0)
|
||||
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
|
||||
if(csw_t == 0)
|
||||
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
|
||||
|
||||
ImportGauge(_Umu);
|
||||
}

// *NOT* EO
template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
@ -67,10 +107,13 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu);
double t1 = usecond();
GridBase *grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);

double t2 = usecond();
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@ -79,19 +122,22 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);

double t3 = usecond();
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r;
CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;

double t4 = usecond();
int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension;

double t5 = usecond();
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
@ -100,7 +146,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
//if (csw!=0){
for (int j = 0; j < Ns; j++)
@ -125,6 +171,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
});
}

double t6 = usecond();
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -137,6 +184,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)

pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t7 = usecond();

#if 0
std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::ImportGauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", misc = " << (t5 - t4) / 1e6
<< ", inversions = " << (t6 - t5) / 1e6
<< ", pick cbs = " << (t7 - t6) / 1e6
<< ", total = " << (t7 - t0) / 1e6
<< std::endl;
#endif
}

template <class Impl>
@ -167,7 +228,7 @@ template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.Checkerboard() = in.Checkerboard();
CloverFieldType *Clover;
CloverField *Clover;
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);

if (dag)
@ -182,12 +243,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = adj(*Clover) * in;
Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
}
}
else
@ -205,18 +266,98 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
}
}

} // MooeeInternal

// Derivative parts unpreconditioned pseudofermions
template <class Impl>
void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());

// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());

Impl::extractLinkField(U, this->Umu);

force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);

///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;

Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};

/*
sigma_{\mu \nu}=
| 0         sigma[0]  sigma[1]  sigma[2]  |
| sigma[3]  0         sigma[4]  sigma[5]  |
| sigma[6]  sigma[7]  0         sigma[8]  |
| sigma[9]  sigma[10] sigma[11] 0         |
*/

int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;

RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}

pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
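The `count` bookkeeping in the double loop above walks the table in the comment row by row, skipping the diagonal. A minimal standalone sketch of that indexing (hypothetical helper, not Grid code):

```
#include <cassert>

// Maps (mu, nu), mu != nu, to the linear index into sigma[] used by the
// mu/nu double loop in MDeriv: row mu of the 4x4 table, diagonal skipped.
int sigmaIndex(int mu, int nu) {
  assert(mu != nu && mu < 4 && nu < 4);
  return mu * 3 + (nu < mu ? nu : nu - 1);
}

int main() {
  assert(sigmaIndex(0, 1) == 0);   // SigmaXY
  assert(sigmaIndex(1, 0) == 3);   // MinusSigmaXY
  assert(sigmaIndex(3, 2) == 11);  // MinusSigmaZT
  return 0;
}
```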

// Derivative parts
template <class Impl>

@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER

#ifdef GRID_SIMT
#define LOAD_CHIMU(ptype) \
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane); }
Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane); }
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHIMU(ptype) \
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Chimu_32=ref()(3)(2);}

#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00); \
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10); \
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
permute##dir(Chi_00,Chi_00); \
permute##dir(Chi_01,Chi_01); \
permute##dir(Chi_02,Chi_02); \
permute##dir(Chi_10,Chi_10); \
permute##dir(Chi_11,Chi_11); \
permute##dir(Chi_12,Chi_12);

#endif

@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_32-= UChi_12;

#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON;
{int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON; }

#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=&st_p[DIR+8*ss]; \
ptype=st_perm[DIR]; \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON; }

#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \
SE=&st_p[DIR+8*ss]; \
ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
offset = SE->_offset; \
perm = SE->_permute; \
LOAD_CHIMU(PERM); \
PROJ; \
MULT_2SPIN(DIR); \
RECON;
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
auto offset = SE->_offset; \
auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \
PROJ; \
MULT_2SPIN(DIR); \
RECON; }

#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
} \
acceleratorSynchronise();
{ int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
} \
acceleratorSynchronise(); }

#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
} \
acceleratorSynchronise();
{ int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
auto offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
} \
acceleratorSynchronise(); }
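The reworked legs all follow one pattern: brace the expansion and declare `ptype`/`offset`/`local`/`perm` as block-local `auto` variables. A minimal sketch (toy macros, not Grid code) of why that helps:

```
// Old style: the macro assigns to names the caller must have declared,
// so every kernel carries 'int offset,local,perm,ptype;' even when unused.
// New style: braces give each expansion its own scope, the kernels drop
// the shared declarations, and repeated legs cannot interfere.
#define LEG_OLD(v)   idx = (v); sum += idx;
#define LEG_NEW(v)   { auto idx = (v); sum += idx; }

int main() {
  int sum = 0;
  // int idx;             // LEG_OLD would require this outer declaration
  LEG_NEW(1) LEG_NEW(2)   // each expansion owns its private 'idx'
  return sum == 3 ? 0 : 1;
}
```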

#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
coalescedWrite(ref()(0)(0),result_00,lane); \
coalescedWrite(ref()(0)(1),result_01,lane); \
coalescedWrite(ref()(0)(2),result_02,lane); \
@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,

HAND_DECLARATIONS(Simt);

int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site

HAND_DECLARATIONS(Simt);

int offset,local,perm, ptype;
StencilEntry *SE;

HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_DECLARATIONS(Simt);

StencilEntry *SE;
int offset,local,perm, ptype;

HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@ -640,8 +638,8 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si

HAND_DECLARATIONS(Simt);

int offset,local,perm, ptype;
StencilEntry *SE;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_DECLARATIONS(Simt);

StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@ -699,8 +695,8 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si

HAND_DECLARATIONS(Simt);

int offset, ptype;
// int offset, ptype;
StencilEntry *SE;
int nmu=0;
ZERO_RESULT;
@ -730,8 +726,8 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
HAND_DECLARATIONS(Simt);

StencilEntry *SE;
int offset, ptype;
// int offset, ptype;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@ -0,0 +1,41 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master

Copyright (C) 2017 - 2022

Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>

NAMESPACE_BEGIN(Grid);

#include "impl.h"
template class CompactWilsonCloverFermion<IMPLEMENTATION>;

NAMESPACE_END(Grid);
@ -0,0 +1 @@
../CompactWilsonCloverFermionInstantiation.cc.master
@ -0,0 +1 @@
../CompactWilsonCloverFermionInstantiation.cc.master
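For context, the symlinks above feed a build in which each per-implementation directory compiles the same `.cc.master` with a different `IMPLEMENTATION`. A hedged, self-contained sketch of that explicit-instantiation pattern (toy names, not Grid code):

```
// One translation unit per concrete Impl keeps heavy template bodies out
// of client compiles; the class is emitted exactly once per implementation.
template<class Impl> class Fermion {
public:
  typename Impl::Coeff mass;
};

struct WilsonImplD { using Coeff = double; };

#define IMPLEMENTATION WilsonImplD
template class Fermion<IMPLEMENTATION>;  // explicit instantiation
```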
@ -40,7 +40,7 @@ EOF

done

CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"

for impl in $WILSON_IMPL_LIST
do

@ -78,6 +78,8 @@ public:
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;

typedef SU<Nrepresentation> Group;

// Guido: we can probably separate the types from the HMC functions
// this will create 2 kinds of implementations
// probably confusing the users
@ -118,7 +120,7 @@ public:
LinkField Pmu(P.Grid());
Pmu = Zero();
for (int mu = 0; mu < Nd; mu++) {
SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
Pmu = Pmu*scale;
PokeIndex<LorentzIndex>(P, Pmu, mu);
@ -159,15 +161,15 @@ public:
}

static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::HotConfiguration(pRNG, U);
Group::HotConfiguration(pRNG, U);
}

static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::TepidConfiguration(pRNG, U);
Group::TepidConfiguration(pRNG, U);
}

static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::ColdConfiguration(pRNG, U);
Group::ColdConfiguration(pRNG, U);
}
};
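The hunk above is a pure refactor: a single `Group` typedef now names the symmetry group, and every former `SU<Nrepresentation>::`/`SU<Nc>::` call goes through it. A minimal sketch of the idea (toy types, not Grid code):

```
template<int N> struct SU {
  static int HotConfiguration() { return N; }  // stand-in for the real call
};

struct GaugeImpl {
  static constexpr int Nrepresentation = 3;
  typedef SU<Nrepresentation> Group;           // single point of change
  static int Hot() { return Group::HotConfiguration(); }
};

int main() { return GaugeImpl::Hot() == 3 ? 0 : 1; }
```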

@ -1,61 +1,63 @@
Using HMC in Grid version 0.5.1
# Using HMC in Grid

These are the instructions to use the Generalised HMC on Grid version 0.5.1.
Disclaimer: GRID is still under active development so any information here can be changed in future releases.
These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
Disclaimer: Grid is still under active development so any information here can be changed in future releases.

Command line options
===================
(relevant file GenericHMCrunner.h)
## Command line options

(relevant file `GenericHMCrunner.h`)
The initial configuration can be changed at the command line using
--StartType <your choice>
valid choices, one among these
HotStart, ColdStart, TepidStart, CheckpointStart
default: HotStart
`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
Default: `--StartingType HotStart`

example
./My_hmc_exec --StartType HotStart
Example:
```
./My_hmc_exec --StartingType HotStart
```

The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
default: 0
The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable, and the initial configuration is specified by
`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
Default: `--StartingTrajectory 0`

The number of trajectories for a specific run is specified at command line by
--Trajectories <integer>
default: 1
`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
Default: `--Trajectories 1`

The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
default: 10

`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
Default: `--Thermalizations 10`

Any other parameter is defined in the source for the executable.

HMC controls
===========
## HMC controls

The lines

```
std::vector<int> SerSeed({1, 2, 3, 4, 5});
std::vector<int> ParSeed({6, 7, 8, 9, 10});
```

define the seeds for the serial and the parallel RNG.

The line

```
TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
```

declares the number of molecular dynamics steps and the total trajectory length (here 20 steps over a unit-length trajectory, i.e. an integration step size of 0.05).

Actions
======
## Actions

Action names are defined in the file
lib/qcd/Actions.h
Action names are defined in the directory `Grid/qcd/action`.

Gauge actions list:
Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):

```
WilsonGaugeActionR;
WilsonGaugeActionF;
WilsonGaugeActionD;
@ -68,8 +70,9 @@ IwasakiGaugeActionD;
SymanzikGaugeActionR;
SymanzikGaugeActionF;
SymanzikGaugeActionD;
```

```
ConjugateWilsonGaugeActionR;
ConjugateWilsonGaugeActionF;
ConjugateWilsonGaugeActionD;
@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
ConjugateSymanzikGaugeActionR;
ConjugateSymanzikGaugeActionF;
ConjugateSymanzikGaugeActionD;
```

Each of these actions accepts a single parameter at creation time (beta).
Example for creating a Symanzik action with beta=4.0:

```
SymanzikGaugeActionR(4.0)
```

Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):

```
ScalarActionR;
ScalarActionF;
ScalarActionD;
```

each of these action accept one single parameter at creation time (beta).
Example for creating a Symanzik action with beta=4.0

SymanzikGaugeActionR(4.0)

The suffixes R,F,D in the action names refer to the Real
(the precision is defined at compile time by the --enable-precision flag in the configure),
Float and Double, that force the precision of the action to be 32, 64 bit respectively.

The suffixes `R`, `F`, `D` in the action names refer to `Real`
(the default precision, selected at compile time by the `--enable-precision` configure flag),
`Float`, and `Double`, which force the precision of the action to be 32 or 64 bit respectively.
35
Grid/serialisation/BaseIO.cc
Normal file
@ -0,0 +1,35 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/serialisation/BaseIO.h

Copyright (C) 2015

Author: Michael Marshall <michael.marshall@ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/GridCore.h>

NAMESPACE_BEGIN(Grid)

std::uint64_t EigenIO::EigenResizeCounter(0);

NAMESPACE_END(Grid)
@ -9,6 +9,7 @@
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Michael Marshall <michael.marshall@ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,6 +31,7 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
#define GRID_SERIALISATION_ABSTRACT_READER_H

#include <atomic>
#include <type_traits>
#include <Grid/tensors/Tensors.h>
#include <Grid/serialisation/VectorUtils.h>
@ -110,6 +112,10 @@ namespace Grid {
template <typename ET>
inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); }

// Counter for resized EigenTensors (poor man's substitute for allocator)
// Defined in BaseIO.cc
extern std::uint64_t EigenResizeCounter;
}

// Abstract writer/reader classes ////////////////////////////////////////////
@ -497,8 +503,14 @@ namespace Grid {
typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
{
#ifdef GRID_OMP
// The memory counter is the reason this must be done from the primary thread
assert(omp_in_parallel()==0 && "Deserialisation which resizes Eigen tensor must happen from primary thread");
#endif
EigenIO::EigenResizeCounter -= static_cast<uint64_t>(t.size()) * sizeof(typename ETensor::Scalar);
//t.reshape( dims );
t.resize( dims );
EigenIO::EigenResizeCounter += static_cast<uint64_t>(t.size()) * sizeof(typename ETensor::Scalar);
}
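A distilled sketch of the accounting pattern in `Reshape` (hypothetical names): debit the old allocation before the resize, credit the new one after, so the counter always holds the net bytes held by live tensors.

```
#include <cstdint>
#include <vector>

std::uint64_t resizeCounter = 0;  // stand-in for EigenIO::EigenResizeCounter

template <typename Container>
void trackedResize(Container &c, std::size_t n) {
  resizeCounter -= c.size() * sizeof(typename Container::value_type);
  c.resize(n);
  resizeCounter += c.size() * sizeof(typename Container::value_type);
}

int main() {
  std::vector<double> v(4);
  resizeCounter += v.size() * sizeof(double);  // initial credit
  trackedResize(v, 10);                        // counter now 10 * 8 bytes
  return resizeCounter == 10 * sizeof(double) ? 0 : 1;
}
```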

template <typename T>

@ -1,8 +1,39 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./Grid/serialisation/VectorUtils.h

Copyright (C) 2015

Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Michael Marshall <michael.marshall@ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#include <Grid/Grid.h>

using namespace Grid;
#ifndef H5_NO_NAMESPACE
using namespace H5NS;
using namespace H5NS; // Compile error here? Try adding --enable-cxx to hdf5 configure
#endif

// Writer implementation ///////////////////////////////////////////////////////

@ -1,3 +1,34 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./Grid/serialisation/VectorUtils.h

Copyright (C) 2015

Author: Peter Boyle <paboyle@ed.ac.uk>
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Michael Marshall <michael.marshall@ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */

#ifndef GRID_SERIALISATION_HDF5_H
#define GRID_SERIALISATION_HDF5_H

@ -9,10 +40,6 @@
#include <Grid/tensors/Tensors.h>
#include "Hdf5Type.h"

#ifndef H5_NO_NAMESPACE
#define H5NS H5
#endif

// default threshold above which datasets are used instead of attributes
#ifndef HDF5_DEF_DATASET_THRES
#define HDF5_DEF_DATASET_THRES 6u
@ -34,11 +61,13 @@ namespace Grid
template <typename U>
void writeDefault(const std::string &s, const U &x);
template <typename U>
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
void writeRagged(const std::string &s, const std::vector<U> &x);
template <typename U>
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
writeDefault(const std::string &s, const std::vector<U> &x);
template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
writeDefault(const std::string &s, const std::vector<U> &x);
typename std::enable_if<!is_flattenable<std::vector<U>>::value>::type
writeDefault(const std::string &s, const std::vector<U> &x) { writeRagged(s, x); }
template <typename U>
void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
H5NS::Group & getGroup(void);
@ -64,11 +93,13 @@ namespace Grid
template <typename U>
void readDefault(const std::string &s, U &output);
template <typename U>
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
void readRagged(const std::string &s, std::vector<U> &x);
template <typename U>
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
readDefault(const std::string &s, std::vector<U> &x);
template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
readDefault(const std::string &s, std::vector<U> &x);
typename std::enable_if<!is_flattenable<std::vector<U>>::value>::type
readDefault(const std::string &s, std::vector<U> &x) { readRagged(s, x); }
template <typename U>
void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
H5NS::Group & getGroup(void);
@ -176,24 +207,30 @@ namespace Grid
}

template <typename U>
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
{
// alias to element type
typedef typename element<std::vector<U>>::type Element;

// flatten the vector and getting dimensions
Flatten<std::vector<U>> flat(x);
std::vector<size_t> dim;
const auto &flatx = flat.getFlatVector();
for (auto &d: flat.getDim())
dim.push_back(d);
writeMultiDim<Element>(s, dim, &flatx[0], flatx.size());
if (isRegularShape(x))
{
// alias to element type
using Scalar = typename is_flattenable<std::vector<U>>::type;

// flatten the vector and getting dimensions
Flatten<std::vector<U>> flat(x);
std::vector<size_t> dim;
const auto &flatx = flat.getFlatVector();
for (auto &d: flat.getDim())
dim.push_back(d);
writeMultiDim<Scalar>(s, dim, &flatx[0], flatx.size());
}
else
{
writeRagged(s, x);
}
}

template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
void Hdf5Writer::writeRagged(const std::string &s, const std::vector<U> &x)
{
push(s);
writeSingleAttribute(x.size(), HDF5_GRID_GUARD "vector_size",
@ -229,7 +266,7 @@ namespace Grid
void Hdf5Reader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
{
// alias to element type
typedef typename element<std::vector<U>>::type Element;
using Scalar = typename is_flattenable<std::vector<U>>::type;

// read the dimensions
H5NS::DataSpace dataSpace;
@ -260,37 +297,44 @@ namespace Grid
H5NS::DataSet dataSet;

dataSet = group_.openDataSet(s);
dataSet.read(buf.data(), Hdf5Type<Element>::type());
dataSet.read(buf.data(), Hdf5Type<Scalar>::type());
}
else
{
H5NS::Attribute attribute;

attribute = group_.openAttribute(s);
attribute.read(Hdf5Type<Element>::type(), buf.data());
attribute.read(Hdf5Type<Scalar>::type(), buf.data());
}
}

template <typename U>
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
{
// alias to element type
typedef typename element<std::vector<U>>::type Element;
if (H5Lexists (group_.getId(), s.c_str(), H5P_DEFAULT) > 0
&& H5Aexists_by_name(group_.getId(), s.c_str(), HDF5_GRID_GUARD "vector_size", H5P_DEFAULT ) > 0)
{
readRagged(s, x);
}
else
{
// alias to element type
using Scalar = typename is_flattenable<std::vector<U>>::type;

std::vector<size_t> dim;
std::vector<Element> buf;
readMultiDim( s, buf, dim );
std::vector<size_t> dim;
std::vector<Scalar> buf;
readMultiDim( s, buf, dim );

// reconstruct the multidimensional vector
Reconstruct<std::vector<U>> r(buf, dim);

x = r.getVector();
// reconstruct the multidimensional vector
Reconstruct<std::vector<U>> r(buf, dim);

x = r.getVector();
}
}

template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
void Hdf5Reader::readRagged(const std::string &s, std::vector<U> &x)
{
uint64_t size;

@ -5,7 +5,9 @@
#include <complex>
#include <memory>

#ifndef H5_NO_NAMESPACE
#ifdef H5_NO_NAMESPACE
#define H5NS
#else
#define H5NS H5
#endif
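A self-contained illustration of the corrected guard: with the namespace present, `H5NS::` qualifies into `H5`; when hdf5 is built without the C++ namespace (`H5_NO_NAMESPACE` defined), `H5NS` expands to nothing and the same spelling resolves at global scope. (Toy type below; the real one is hdf5's.)

```
// #define H5_NO_NAMESPACE   // as hdf5 would when built without the namespace
#ifdef H5_NO_NAMESPACE
#define H5NS
#else
#define H5NS H5
#endif

namespace H5 { struct Group { int id = 0; }; }

int main() {
  H5NS::Group g;   // H5::Group here; ::Group if H5_NO_NAMESPACE were set
  return g.id;
}
```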

@ -118,13 +118,13 @@ static inline std::string SerialisableClassName(void) {return std::string(#cname
static constexpr bool isEnum = false; \
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
template <typename T>\
static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
static inline void write(::Grid::Writer<T> &WR,const std::string &s, const cname &obj){ \
push(WR,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
pop(WR);\
}\
template <typename T>\
static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
static inline void read(::Grid::Reader<T> &RD,const std::string &s, cname &obj){ \
if (!push(RD,s))\
{\
std::cout << ::Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl; \
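The `::Grid::` qualification matters because the serialisation macro is expanded at the user's call site, which may sit in an arbitrary namespace. A minimal sketch (not Grid code) of the failure the fix prevents:

```
namespace Grid { template <typename T> struct Writer {}; }

namespace user {
  struct Writer {};  // unrelated type that unqualified lookup would find

  // What the macro now generates: fully qualified, immune to the local name.
  template <typename T>
  void write(::Grid::Writer<T> &) {}
}

int main() {
  ::Grid::Writer<int> wr;
  user::write(wr);   // resolves correctly despite user::Writer
  return 0;
}
```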

@ -9,7 +9,8 @@
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

Author: Michael Marshall <michael.marshall@ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@ -236,21 +237,36 @@ namespace Grid {
}
}

// Vector element trait //////////////////////////////////////////////////////
template <typename T>
struct element
// is_flattenable<T>::value is true if T is a std::vector<> which can be flattened //////////////////////
template <typename T, typename V = void>
struct is_flattenable : std::false_type
{
typedef T type;
static constexpr bool is_number = false;
using type = T;
using grid_type = T;
static constexpr int vecRank = 0;
static constexpr bool isGridTensor = false;
static constexpr bool children_flattenable = std::is_arithmetic<T>::value or is_complex<T>::value;
};

template <typename T>
struct element<std::vector<T>>
struct is_flattenable<T, typename std::enable_if<isGridTensor<T>::value>::type> : std::false_type
{
typedef typename element<T>::type type;
static constexpr bool is_number = std::is_arithmetic<T>::value
or is_complex<T>::value
or element<T>::is_number;
using type = typename GridTypeMapper<T>::scalar_type;
using grid_type = T;
static constexpr int vecRank = 0;
static constexpr bool isGridTensor = true;
static constexpr bool children_flattenable = true;
};

template <typename T>
struct is_flattenable<std::vector<T>, typename std::enable_if<is_flattenable<T>::children_flattenable>::type>
: std::true_type
{
using type = typename is_flattenable<T>::type;
using grid_type = typename is_flattenable<T>::grid_type;
static constexpr bool isGridTensor = is_flattenable<T>::isGridTensor;
static constexpr int vecRank = is_flattenable<T>::vecRank + 1;
static constexpr bool children_flattenable = true;
};
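A few compile-time checks, assuming the trait exactly as in the hunk above is in scope, make the new semantics concrete:

```
#include <vector>
// Assumes is_flattenable<> (and Grid's is_complex<>) from the hunk above.
static_assert(!is_flattenable<double>::value,
              "a bare scalar is not itself flattenable...");
static_assert(is_flattenable<double>::children_flattenable,
              "...but it can terminate a flattenable nest");
static_assert(is_flattenable<std::vector<double>>::value &&
              is_flattenable<std::vector<double>>::vecRank == 1,
              "vecRank counts std::vector nesting levels");
static_assert(is_flattenable<std::vector<std::vector<double>>>::vecRank == 2,
              "one level per nested vector");
```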

// Vector flattening utility class ////////////////////////////////////////////
@ -259,23 +275,30 @@ namespace Grid {
class Flatten
{
public:
typedef typename element<V>::type Element;
using Scalar = typename is_flattenable<V>::type;
static constexpr bool isGridTensor = is_flattenable<V>::isGridTensor;
public:
explicit Flatten(const V &vector);
const V & getVector(void);
const std::vector<Element> & getFlatVector(void);
const std::vector<size_t> & getDim(void);
explicit Flatten(const V &vector);
const V & getVector(void) const { return vector_; }
const std::vector<Scalar> & getFlatVector(void) const { return flatVector_; }
const std::vector<size_t> & getDim(void) const { return dim_; }
private:
void accumulate(const Element &e);
template <typename W>
void accumulate(const W &v);
void accumulateDim(const Element &e);
template <typename W>
void accumulateDim(const W &v);
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
accumulate(const W &e);
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
accumulate(const W &e);
template <typename W> typename std::enable_if< is_flattenable<W>::value>::type
accumulate(const W &v);
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
accumulateDim(const W &e) {} // Innermost is a scalar - do nothing
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
accumulateDim(const W &e);
template <typename W> typename std::enable_if< is_flattenable<W>::value>::type
accumulateDim(const W &v);
private:
const V &vector_;
std::vector<Element> flatVector_;
std::vector<size_t> dim_;
const V &vector_;
std::vector<Scalar> flatVector_;
std::vector<size_t> dim_;
};

// Class to reconstruct a multidimensional std::vector
@ -283,38 +306,57 @@ namespace Grid {
class Reconstruct
{
public:
typedef typename element<V>::type Element;
using Scalar = typename is_flattenable<V>::type;
static constexpr bool isGridTensor = is_flattenable<V>::isGridTensor;
public:
Reconstruct(const std::vector<Element> &flatVector,
Reconstruct(const std::vector<Scalar> &flatVector,
const std::vector<size_t> &dim);
const V & getVector(void);
const std::vector<Element> & getFlatVector(void);
const std::vector<size_t> & getDim(void);
const V & getVector(void) const { return vector_; }
const std::vector<Scalar> & getFlatVector(void) const { return flatVector_; }
const std::vector<size_t> & getDim(void) const { return dim_; }
private:
void fill(std::vector<Element> &v);
template <typename W>
void fill(W &v);
void resize(std::vector<Element> &v, const unsigned int dim);
template <typename W>
void resize(W &v, const unsigned int dim);
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
fill(W &v);
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
fill(W &v);
template <typename W> typename std::enable_if< is_flattenable<W>::value>::type
fill(W &v);
template <typename W> typename std::enable_if< is_flattenable<W>::value && is_flattenable<W>::vecRank==1>::type
resize(W &v, const unsigned int dim);
template <typename W> typename std::enable_if< is_flattenable<W>::value && (is_flattenable<W>::vecRank>1)>::type
resize(W &v, const unsigned int dim);
template <typename W> typename std::enable_if<!is_flattenable<W>::isGridTensor>::type
checkInnermost(const W &e) {} // Innermost is a scalar - do nothing
template <typename W> typename std::enable_if< is_flattenable<W>::isGridTensor>::type
checkInnermost(const W &e);
private:
V vector_;
const std::vector<Element> &flatVector_;
std::vector<size_t> dim_;
size_t ind_{0};
unsigned int dimInd_{0};
V vector_;
const std::vector<Scalar> &flatVector_;
std::vector<size_t> dim_;
size_t ind_{0};
unsigned int dimInd_{0};
};

// Flatten class template implementation
template <typename V>
void Flatten<V>::accumulate(const Element &e)
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
Flatten<V>::accumulate(const W &e)
{
flatVector_.push_back(e);
}

template <typename V>
template <typename W>
void Flatten<V>::accumulate(const W &v)
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
Flatten<V>::accumulate(const W &e)
{
for (const Scalar &x: e) {
flatVector_.push_back(x);
}
}

template <typename V>
template <typename W> typename std::enable_if<is_flattenable<W>::value>::type
Flatten<V>::accumulate(const W &v)
{
for (auto &e: v)
{
@ -323,11 +365,17 @@ namespace Grid {
}

template <typename V>
void Flatten<V>::accumulateDim(const Element &e) {};
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
Flatten<V>::accumulateDim(const W &e)
{
using Traits = GridTypeMapper<typename is_flattenable<W>::grid_type>;
for (int rank=0; rank < Traits::Rank; ++rank)
dim_.push_back(Traits::Dimension(rank));
}

template <typename V>
template <typename W>
void Flatten<V>::accumulateDim(const W &v)
template <typename W> typename std::enable_if<is_flattenable<W>::value>::type
Flatten<V>::accumulateDim(const W &v)
{
dim_.push_back(v.size());
accumulateDim(v[0]);
@ -337,42 +385,36 @@ namespace Grid {
Flatten<V>::Flatten(const V &vector)
: vector_(vector)
{
accumulate(vector_);
accumulateDim(vector_);
}

template <typename V>
const V & Flatten<V>::getVector(void)
{
return vector_;
}

template <typename V>
const std::vector<typename Flatten<V>::Element> &
Flatten<V>::getFlatVector(void)
{
return flatVector_;
}

template <typename V>
const std::vector<size_t> & Flatten<V>::getDim(void)
{
return dim_;
std::size_t TotalSize{ dim_[0] };
for (int i = 1; i < dim_.size(); ++i) {
TotalSize *= dim_[i];
}
flatVector_.reserve(TotalSize);
accumulate(vector_);
}

// Reconstruct class template implementation
template <typename V>
void Reconstruct<V>::fill(std::vector<Element> &v)
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
Reconstruct<V>::fill(W &v)
{
v = flatVector_[ind_++];
}

template <typename V>
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
Reconstruct<V>::fill(W &v)
{
for (auto &e: v)
{
e = flatVector_[ind_++];
}
}

template <typename V>
template <typename W>
void Reconstruct<V>::fill(W &v)
template <typename W> typename std::enable_if<is_flattenable<W>::value>::type
Reconstruct<V>::fill(W &v)
{
for (auto &e: v)
{
@ -381,14 +423,15 @@ namespace Grid {
}

template <typename V>
void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
template <typename W> typename std::enable_if<is_flattenable<W>::value && is_flattenable<W>::vecRank==1>::type
Reconstruct<V>::resize(W &v, const unsigned int dim)
{
v.resize(dim_[dim]);
}

template <typename V>
template <typename W>
void Reconstruct<V>::resize(W &v, const unsigned int dim)
template <typename W> typename std::enable_if<is_flattenable<W>::value && (is_flattenable<W>::vecRank>1)>::type
Reconstruct<V>::resize(W &v, const unsigned int dim)
{
v.resize(dim_[dim]);
for (auto &e: v)
@ -398,34 +441,31 @@ namespace Grid {
}

template <typename V>
Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
template <typename W> typename std::enable_if<is_flattenable<W>::isGridTensor>::type
Reconstruct<V>::checkInnermost(const W &)
{
using Traits = GridTypeMapper<typename is_flattenable<W>::grid_type>;
const int gridRank{Traits::Rank};
const int dimRank{static_cast<int>(dim_.size())};
assert(dimRank >= gridRank && "Tensor rank too low for Grid tensor");
for (int i=0; i<gridRank; ++i) {
assert(dim_[dimRank - gridRank + i] == Traits::Dimension(i) && "Tensor dimension doesn't match Grid tensor");
}
dim_.resize(dimRank - gridRank);
}

template <typename V>
Reconstruct<V>::Reconstruct(const std::vector<Scalar> &flatVector,
const std::vector<size_t> &dim)
: flatVector_(flatVector)
, dim_(dim)
{
checkInnermost(vector_);
assert(dim_.size() == is_flattenable<V>::vecRank && "Tensor rank doesn't match nested std::vector rank");
resize(vector_, 0);
fill(vector_);
}
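A round-trip sketch using the two classes as declared above (plain nested vectors, no Grid tensors):

```
#include <cassert>
#include <vector>
// Assumes Flatten<> / Reconstruct<> from the hunks above are in scope.
int main() {
  using Nested = std::vector<std::vector<double>>;
  Nested v{{1, 2, 3}, {4, 5, 6}};

  Flatten<Nested> flat(v);                 // row-major scalars + dimensions
  Reconstruct<Nested> rec(flat.getFlatVector(), flat.getDim());

  assert(rec.getVector() == v);            // lossless for regular shapes
  return 0;
}
```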

template <typename V>
const V & Reconstruct<V>::getVector(void)
{
return vector_;
}

template <typename V>
const std::vector<typename Reconstruct<V>::Element> &
Reconstruct<V>::getFlatVector(void)
{
return flatVector_;
}

template <typename V>
const std::vector<size_t> & Reconstruct<V>::getDim(void)
{
return dim_;
}

// Vector IO utilities ///////////////////////////////////////////////////////
// helper function to read space-separated values
template <typename T>
@ -459,6 +499,64 @@ namespace Grid {

return os;
}

// In general, scalar types are considered "flattenable" (regularly shaped)
template <typename T>
bool isRegularShapeHelper(const std::vector<T> &, std::vector<std::size_t> &, int, bool)
{
return true;
}

template <typename T>
bool isRegularShapeHelper(const std::vector<std::vector<T>> &v, std::vector<std::size_t> &Dims, int Depth, bool bFirst)
{
if( bFirst)
{
assert( Dims.size() == Depth && "Bug: Delete this message after testing" );
Dims.push_back(v[0].size());
if (!Dims[Depth])
return false;
}
else
{
assert( Dims.size() >= Depth + 1 && "Bug: Delete this message after testing" );
}
for (std::size_t i = 0; i < v.size(); ++i)
{
if (v[i].size() != Dims[Depth] || !isRegularShapeHelper(v[i], Dims, Depth + 1, bFirst && i==0))
{
return false;
}
}
return true;
}

template <typename T>
bool isRegularShape(const T &t) { return true; }

template <typename T>
bool isRegularShape(const std::vector<T> &v) { return !v.empty(); }

// Return true if all dimensions of this std::vector<std::vector<T>> are regularly shaped
template <typename T>
bool isRegularShape(const std::vector<std::vector<T>> &v)
{
if (v.empty() || v[0].empty())
return false;
// Make sure all of my rows are the same size
std::vector<std::size_t> Dims;
Dims.reserve(is_flattenable<T>::vecRank);
Dims.push_back(v.size());
Dims.push_back(v[0].size());
for (std::size_t i = 0; i < Dims[0]; ++i)
{
if (v[i].size() != Dims[1] || !isRegularShapeHelper(v[i], Dims, 2, i==0))
{
return false;
}
}
return true;
}
}
|
||||
|
||||
// helper function to read space-separated values
|
||||
|
@ -322,25 +322,12 @@ public:
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
// int recv_from_rank;
|
||||
// int xmit_to_rank;
|
||||
|
||||
if ( ! comm_dim ) return 1;
|
||||
|
||||
int nbr_proc;
|
||||
if (displacement>0) nbr_proc = 1;
|
||||
else nbr_proc = pd-1;
|
||||
|
||||
// FIXME this logic needs to be sorted for three link term
|
||||
// assert( (displacement==1) || (displacement==-1));
|
||||
// Present hack only works for >= 4^4 subvol per node
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
|
||||
|
||||
if ( (shm==NULL) || Stencil_force_mpi ) return 0;
|
||||
|
||||
return 1;
|
||||
if ( displacement == 0 ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
@ -1020,7 +1007,6 @@ public:
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
int shm_receive_only = 1;
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
@ -1052,10 +1038,6 @@ public:
|
||||
assert (xmit_to_rank != _grid->ThisRank());
|
||||
assert (recv_from_rank != _grid->ThisRank());
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// try the direct copy if possible
|
||||
/////////////////////////////////////////////////////////
|
||||
cobj *send_buf;
|
||||
cobj *recv_buf;
|
||||
if ( compress.DecompressionStep() ) {
|
||||
recv_buf=u_simd_recv_buf[0];
|
||||
@ -1063,52 +1045,36 @@ public:
|
||||
recv_buf=this->u_recv_buf_p;
|
||||
}
|
||||
|
||||
send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
|
||||
if ( (send_buf==NULL) || Stencil_force_mpi ) {
|
||||
send_buf = this->u_send_buf_p;
|
||||
}
|
||||
|
||||
// Find out if we get the direct copy.
|
||||
void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p);
|
||||
if ((success==NULL)||Stencil_force_mpi) {
|
||||
// we found a packet that comes from MPI and contributes to this leg of stencil
|
||||
shm_receive_only = 0;
|
||||
}
|
||||
cobj *send_buf;
|
||||
send_buf = this->u_send_buf_p; // Gather locally, must send
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Gather locally
|
||||
////////////////////////////////////////////////////////
|
||||
gathertime-=usecond();
|
||||
assert(send_buf!=NULL);
|
||||
Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
|
||||
gathertime+=usecond();
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Build a list of things to do after we synchronise GPUs
|
||||
// Start comms now???
|
||||
///////////////////////////////////////////////////////////
|
||||
AddPacket((void *)&send_buf[u_comm_offset],
|
||||
(void *)&recv_buf[u_comm_offset],
|
||||
xmit_to_rank,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
if ( compress.DecompressionStep() ) {
|
||||
|
||||
if ( shm_receive_only ) { // Early decompress before MPI is finished is possible
|
||||
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
|
||||
&recv_buf[u_comm_offset],
|
||||
words,DecompressionsSHM);
|
||||
} else { // Decompress after MPI is finished
|
||||
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
|
||||
&recv_buf[u_comm_offset],
|
||||
words,Decompressions);
|
||||
}
|
||||
|
||||
AddPacket((void *)&send_buf[u_comm_offset],
|
||||
(void *)&recv_buf[u_comm_offset],
|
||||
xmit_to_rank,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
} else {
|
||||
AddPacket((void *)&send_buf[u_comm_offset],
|
||||
(void *)&this->u_recv_buf_p[u_comm_offset],
|
||||
xmit_to_rank,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
AddDecompress(&this->u_recv_buf_p[u_comm_offset],
|
||||
&recv_buf[u_comm_offset],
|
||||
words,Decompressions);
|
||||
}
|
||||
u_comm_offset+=words;
|
||||
}
|
||||
}
|
||||
return shm_receive_only;
|
||||
return 0;
|
||||
}
|
||||
|
||||
template<class compressor>
|
||||
@ -1159,7 +1125,6 @@ public:
|
||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
// loop over outer coord planes orthog to dim
|
||||
int shm_receive_only = 1;
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
||||
@ -1214,20 +1179,7 @@ public:
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
// shm == receive pointer if offnode
|
||||
// shm == Translate[send pointer] if on node -- my view of his send pointer
|
||||
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
||||
if ((shm==NULL)||Stencil_force_mpi) {
|
||||
shm = rp;
|
||||
// we found a packet that comes from MPI and contributes to this shift.
|
||||
// is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
|
||||
// Kernel will add the exterior_terms except if is_same_node.
|
||||
shm_receive_only = 0;
|
||||
// leg of stencil
|
||||
}
|
||||
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
|
||||
// assuming above pointer flip
|
||||
rpointers[i] = shm;
|
||||
rpointers[i] = rp;
|
||||
|
||||
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
|
||||
|
||||
@ -1239,102 +1191,17 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if ( shm_receive_only ) {
|
||||
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM);
|
||||
} else {
|
||||
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||
}
|
||||
AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||
|
||||
u_comm_offset +=buffer_size;
|
||||
}
|
||||
}
|
||||
return shm_receive_only;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ZeroCounters(void) {
|
||||
gathertime = 0.;
|
||||
commtime = 0.;
|
||||
mpi3synctime=0.;
|
||||
mpi3synctime_g=0.;
|
||||
shmmergetime=0.;
|
||||
for(int i=0;i<this->_npoints;i++){
|
||||
comm_time_thr[i]=0;
|
||||
comm_bytes_thr[i]=0;
|
||||
comm_enter_thr[i]=0;
|
||||
comm_leave_thr[i]=0;
|
||||
shm_bytes_thr[i]=0;
|
||||
}
|
||||
halogtime = 0.;
|
||||
mergetime = 0.;
|
||||
decompresstime = 0.;
|
||||
gathermtime = 0.;
|
||||
splicetime = 0.;
|
||||
nosplicetime = 0.;
|
||||
comms_bytes = 0.;
|
||||
shm_bytes = 0.;
|
||||
calls = 0.;
|
||||
};
|
||||
void ZeroCounters(void) { };
|
||||
|
||||
void Report(void) {
|
||||
#define AVERAGE(A)
|
||||
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
|
||||
RealD NP = _grid->_Nprocessors;
|
||||
RealD NN = _grid->NodeCount();
|
||||
double t = 0;
|
||||
// if comm_time_thr is set they were all done in parallel so take the max
|
||||
// but add up the bytes
|
||||
int threaded = 0 ;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if ( comm_time_thr[i]>0.0 ) {
|
||||
threaded = 1;
|
||||
comms_bytes += comm_bytes_thr[i];
|
||||
shm_bytes += shm_bytes_thr[i];
|
||||
if (t < comm_time_thr[i]) t = comm_time_thr[i];
|
||||
}
|
||||
}
|
||||
if (threaded) commtime += t;
|
||||
|
||||
_grid->GlobalSum(commtime); commtime/=NP;
|
||||
if ( calls > 0. ) {
|
||||
std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
|
||||
PRINTIT(halogtime);
|
||||
PRINTIT(gathertime);
|
||||
PRINTIT(gathermtime);
|
||||
PRINTIT(mergetime);
|
||||
PRINTIT(decompresstime);
|
||||
if(comms_bytes>1.0){
|
||||
PRINTIT(comms_bytes);
|
||||
PRINTIT(commtime);
|
||||
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
|
||||
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||
}
|
||||
if(shm_bytes>1.0){
|
||||
PRINTIT(shm_bytes); // X bytes + R bytes
|
||||
// Double this to include spin projection overhead with 2:1 ratio in wilson
|
||||
auto gatheralltime = gathertime+gathermtime;
|
||||
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
|
||||
std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||
|
||||
auto all_bytes = comms_bytes+shm_bytes;
|
||||
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
|
||||
std::cout << GridLogMessage << " Stencil SHM all " << (all_bytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||
|
||||
auto membytes = (shm_bytes + comms_bytes/2) // read/write
|
||||
+ (shm_bytes+comms_bytes)/2 * sizeof(vobj)/sizeof(cobj);
|
||||
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000. << " GB/s per rank"<<std::endl;
|
||||
std::cout << GridLogMessage << " Stencil SHM mem " << (membytes)/gatheralltime/1000.*NP/NN << " GB/s per node"<<std::endl;
|
||||
}
|
||||
/*
|
||||
PRINTIT(mpi3synctime);
|
||||
PRINTIT(mpi3synctime_g);
|
||||
PRINTIT(shmmergetime);
|
||||
PRINTIT(splicetime);
|
||||
PRINTIT(nosplicetime);
|
||||
*/
|
||||
}
|
||||
#undef PRINTIT
|
||||
#undef AVERAGE
|
||||
};
|
||||
void Report(void) { };
|
||||
|
||||
};
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -417,7 +417,7 @@ public:
|
||||
stream << "{";
|
||||
for (int j = 0; j < N; j++) {
|
||||
stream << o._internal[i][j];
|
||||
if (i < N - 1) stream << ",";
|
||||
if (j < N - 1) stream << ",";
|
||||
}
|
||||
stream << "}";
|
||||
if (i != N - 1) stream << "\n\t\t";
|
||||
|
@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
|
||||
class TypePair {
|
||||
public:
|
||||
T _internal[2];
|
||||
TypePair<T>& operator=(const Grid::Zero& o) {
|
||||
accelerator TypePair<T>& operator=(const Grid::Zero& o) {
|
||||
_internal[0] = Zero();
|
||||
_internal[1] = Zero();
|
||||
return *this;
|
||||
}
|
||||
|
||||
TypePair<T> operator+(const TypePair<T>& o) const {
|
||||
accelerator TypePair<T> operator+(const TypePair<T>& o) const {
|
||||
TypePair<T> r;
|
||||
r._internal[0] = _internal[0] + o._internal[0];
|
||||
r._internal[1] = _internal[1] + o._internal[1];
|
||||
return r;
|
||||
}
|
||||
|
||||
TypePair<T>& operator+=(const TypePair<T>& o) {
|
||||
accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
|
||||
_internal[0] += o._internal[0];
|
||||
_internal[1] += o._internal[1];
|
||||
return *this;
|
||||
|
@ -8,6 +8,7 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
cudaDeviceProp *gpu_props;
|
||||
cudaStream_t copyStream;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int nDevices = 1;
|
||||
@ -73,29 +74,43 @@ void acceleratorInit(void)
|
||||
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||
}
|
||||
}
|
||||
|
||||
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
|
||||
#undef GPU_PROP_FMT
|
||||
#undef GPU_PROP
|
||||
|
||||
#ifdef GRID_DEFAULT_GPU
|
||||
int device = 0;
|
||||
// IBM jsrun makes CUDA device numbering screwy, so it need not match the rank
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorCudaInit: using default device \n");
|
||||
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
|
||||
printf("AcceleratorCudaInit: assume user either uses\n");
|
||||
printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
|
||||
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
|
||||
printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n");
|
||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
#else
|
||||
int device = rank;
|
||||
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||
printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n");
|
||||
cudaSetDevice(rank);
|
||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
|
||||
#endif
|
||||
|
||||
cudaSetDevice(device);
|
||||
cudaStreamCreate(©Stream);
|
||||
const int len=64;
|
||||
char busid[len];
|
||||
if( rank == world_rank ) {
|
||||
cudaDeviceGetPCIBusId(busid, len, device);
|
||||
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
|
||||
}
|
||||
|
||||
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GRID_HIP
|
||||
hipDeviceProp_t *gpu_props;
|
||||
hipStream_t copyStream;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int nDevices = 1;
|
||||
@ -153,16 +168,25 @@ void acceleratorInit(void)
|
||||
#ifdef GRID_DEFAULT_GPU
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorHipInit: using default device \n");
|
||||
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
|
||||
printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
int device = 0;
|
||||
#else
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
|
||||
}
|
||||
hipSetDevice(rank);
|
||||
int device = rank;
|
||||
#endif
|
||||
hipSetDevice(device);
|
||||
hipStreamCreate(©Stream);
|
||||
const int len=64;
|
||||
char busid[len];
|
||||
if( rank == world_rank ) {
|
||||
hipDeviceGetPCIBusId(busid, len, device);
|
||||
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
|
||||
}
|
||||
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
|
||||
}
|
||||
#endif
|
||||
|
@ -95,6 +95,7 @@ void acceleratorInit(void);
|
||||
//////////////////////////////////////////////
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
@ -105,6 +106,7 @@ void acceleratorInit(void);
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
extern int acceleratorAbortOnGpuError;
|
||||
extern cudaStream_t copyStream;
|
||||
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
@ -114,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#endif
|
||||
} // CUDA specific
|
||||
|
||||
inline void cuda_mem(void)
|
||||
{
|
||||
size_t free_t,total_t,used_t;
|
||||
cudaMemGetInfo(&free_t,&total_t);
|
||||
used_t=total_t-free_t;
|
||||
std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
|
||||
}
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
@ -213,9 +223,14 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
{
|
||||
// int uvm=0;
|
||||
@ -271,7 +286,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
if(nt < 8)nt=8; \
|
||||
cl::sycl::range<3> local {nt,1,nsimd}; \
|
||||
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
|
||||
cgh.parallel_for<class dslash>( \
|
||||
cgh.parallel_for( \
|
||||
cl::sycl::nd_range<3>(global,local), \
|
||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
|
||||
[[intel::reqd_sub_group_size(8)]] \
|
||||
@ -289,7 +304,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t
|
||||
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
|
||||
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
|
||||
theGridAccelerator->memcpy(to,from,bytes);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
|
||||
@ -320,10 +338,11 @@ NAMESPACE_BEGIN(Grid);
|
||||
#define accelerator __host__ __device__
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
extern hipStream_t copyStream;
|
||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
return hipThreadIdx_z;
|
||||
return hipThreadIdx_x;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
@ -337,19 +356,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
{ __VA_ARGS__;} \
|
||||
}; \
|
||||
int nt=acceleratorThreads(); \
|
||||
dim3 hip_threads(nt,1,nsimd); \
|
||||
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
||||
0,0, \
|
||||
num1,num2,nsimd,lambda); \
|
||||
dim3 hip_threads(nsimd, nt, 1); \
|
||||
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
|
||||
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
|
||||
0,0, \
|
||||
num1,num2,nsimd, lambda); \
|
||||
} else { \
|
||||
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
|
||||
0,0, \
|
||||
num1,num2,nsimd, lambda); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
template<typename lambda> __global__
|
||||
__launch_bounds__(64,1)
|
||||
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||
{
|
||||
// Following the same scheme as CUDA for now
|
||||
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
|
||||
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
|
||||
uint64_t z = threadIdx.x;
|
||||
if ( (x < numx) && (y<numy) && (z<numz) ) {
|
||||
Lambda(x,y,z);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename lambda> __global__
|
||||
__launch_bounds__(1024,1)
|
||||
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||
{
|
||||
uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
|
||||
uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
|
||||
uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
|
||||
// Following the same scheme as CUDA for now
|
||||
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
|
||||
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
|
||||
uint64_t z = threadIdx.x;
|
||||
if ( (x < numx) && (y<numy) && (z<numz) ) {
|
||||
Lambda(x,y,z);
|
||||
}
|
||||
@ -394,9 +435,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
//inline void acceleratorCopySynchronise(void) { }
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -435,7 +483,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopySynchronise(void) {};
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
|
||||
@ -466,18 +515,12 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
|
||||
///////////////////////////////////////////////////
|
||||
// Synchronise across local threads for divergence resynch
|
||||
///////////////////////////////////////////////////
|
||||
accelerator_inline void acceleratorSynchronise(void)
|
||||
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs
|
||||
{
|
||||
#ifdef GRID_SIMT
|
||||
#ifdef GRID_CUDA
|
||||
__syncwarp();
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
//cl::sycl::detail::workGroupBarrier();
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
__syncthreads();
|
||||
#endif
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ public:
|
||||
// Coordinate class, maxdims = 8 for now.
|
||||
////////////////////////////////////////////////////////////////
|
||||
#define GRID_MAX_LATTICE_DIMENSION (8)
|
||||
#define GRID_MAX_SIMD (16)
|
||||
#define GRID_MAX_SIMD (32)
|
||||
|
||||
static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;
|
||||
|
||||
|
@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
|
||||
return;
|
||||
}
|
||||
|
||||
void GridCmdOptionFloat(std::string &str,float & val)
|
||||
{
|
||||
std::stringstream ss(str);
|
||||
ss>>val;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void GridParseLayout(char **argv,int argc,
|
||||
Coordinate &latt_c,
|
||||
@ -301,6 +308,13 @@ void Grid_init(int *argc,char ***argv)
|
||||
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){
|
||||
int forcempi;
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi");
|
||||
GridCmdOptionInt(arg,forcempi);
|
||||
Stencil_force_mpi = (bool)forcempi;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
|
||||
int MB;
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
|
||||
@ -419,7 +433,9 @@ void Grid_init(int *argc,char ***argv)
|
||||
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --shm-mpi 0|1 : Force MPI usage under multi-rank per node "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
@ -518,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
void Grid_finalize(void)
|
||||
{
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_Finalize();
|
||||
Grid_unquiesce_nodes();
|
||||
#endif
|
||||
|
@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
|
||||
template<class VectorInt>
|
||||
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
|
||||
void GridCmdOptionInt(std::string &str,int & val);
|
||||
void GridCmdOptionFloat(std::string &str,float & val);
|
||||
|
||||
|
||||
void GridParseLayout(char **argv,int argc,
|
||||
|
@ -137,7 +137,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
|
||||
Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
|
||||
double n = BENCH_IO_NPASS;
|
||||
// double n = BENCH_IO_NPASS;
|
||||
|
||||
stats(mean, stdDev, perf);
|
||||
stats(avMean, avStdDev, avPerf);
|
||||
@ -164,7 +164,7 @@ int main (int argc, char ** argv)
|
||||
mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
|
||||
}
|
||||
MSG << std::endl;
|
||||
MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
|
||||
MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
|
||||
MSG << std::endl;
|
||||
grid_printf("%4s %12s %12s %12s %12s\n",
|
||||
"L", "std read", "std write", "Grid read", "Grid write");
|
||||
@ -185,7 +185,7 @@ int main (int argc, char ** argv)
|
||||
avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
|
||||
avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
|
||||
MSG << std::endl;
|
||||
MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
|
||||
MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
|
||||
MSG << std::endl;
|
||||
grid_printf("%12s %12s %12s %12s\n",
|
||||
"std read", "std write", "Grid read", "Grid write");
|
||||
|
@ -142,7 +142,7 @@ public:
|
||||
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||
}
|
||||
|
||||
int ncomm;
|
||||
// int ncomm;
|
||||
double dbytes;
|
||||
|
||||
for(int dir=0;dir<8;dir++) {
|
||||
@ -290,7 +290,7 @@ public:
|
||||
LatticeSU4 z(&Grid); z=Zero();
|
||||
LatticeSU4 x(&Grid); x=Zero();
|
||||
LatticeSU4 y(&Grid); y=Zero();
|
||||
double a=2.0;
|
||||
// double a=2.0;
|
||||
|
||||
uint64_t Nloop=NLOOP;
|
||||
|
||||
|
@ -53,7 +53,7 @@ struct time_statistics{
|
||||
|
||||
void header(){
|
||||
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
|
||||
<<std::setw(11)<<"bytes\t\t"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
|
||||
<<std::setw(11)<<"bytes\t\t"<<"MB/s uni"<<"\t"<<"MB/s bidi"<<std::endl;
|
||||
};
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
@ -72,7 +72,7 @@ int main (int argc, char ** argv)
|
||||
|
||||
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
|
||||
std::vector<double> t_time(Nloop);
|
||||
time_statistics timestat;
|
||||
// time_statistics timestat;
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;
|
||||
|
@ -126,19 +126,10 @@ int main (int argc, char ** argv)
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
LatticeGaugeFieldF Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrixF> U(4,FGrid);
|
||||
{
|
||||
autoView( Umu5d_v, Umu5d, CpuWrite);
|
||||
autoView( Umu_v , Umu , CpuRead);
|
||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||
}
|
||||
}
|
||||
}
|
||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrixF> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
@ -147,10 +138,28 @@ int main (int argc, char ** argv)
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
@ -182,7 +191,7 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
int ncall =3000;
|
||||
int ncall =300;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
@ -242,16 +251,30 @@ int main (int argc, char ** argv)
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
for(int i=0;i<ref_v.size();i++){
|
||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
int i=s+Ls*ss;
|
||||
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
|
@ -184,8 +184,10 @@ int main (int argc, char ** argv)
|
||||
|
||||
double bytes=1.0*vol*Nvec*sizeof(Real);
|
||||
double flops=vol*Nvec*2;// mul,add
|
||||
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
|
||||
|
||||
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"
|
||||
<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"
|
||||
<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
|
||||
assert(nn==nn);
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
|
@ -390,6 +390,7 @@ case ${CXXTEST} in
|
||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
||||
if test $ac_openmp = yes; then
|
||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
|
||||
LDFLAGS="$LDFLAGS -Xcompiler -fopenmp"
|
||||
fi
|
||||
;;
|
||||
hipcc)
|
||||
|
Binary file not shown.
@ -1787,7 +1787,7 @@ Hdf5Writer Hdf5Reader HDF5

Write interfaces, similar to the XML facilities in QDP++, are presented. However,
the serialisation routines are automatically generated by the macro, and a virtual
reader adn writer interface enables writing to any of a number of formats.
reader and writer interface enables writing to any of a number of formats.

**Example**::

@ -1814,6 +1814,91 @@ reader adn writer interface enables writing to any of a number of formats.
}

Eigen tensor support -- added 2019H1
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The Serialisation library was expanded in 2019 to support de/serialisation of
Eigen tensors. De/serialisation of existing types was not changed. Data files
without Eigen tensors remain compatible with earlier versions of Grid and other readers.
Conversely, data files containing serialised Eigen tensors are a breaking change:
earlier versions of Grid cannot read them.

Eigen tensor serialisation support was added to BaseIO, which was modified to provide a Traits class
to recognise Eigen tensors whose elements are either primitive scalars (arithmetic and complex types)
or Grid tensors.

**Traits determining de/serialisable scalars**::

  // Is this an Eigen tensor
  template<typename T> struct is_tensor : std::integral_constant<bool,
    std::is_base_of<Eigen::TensorBase<T, Eigen::ReadOnlyAccessors>, T>::value> {};
  // Is this an Eigen tensor of a supported scalar
  template<typename T, typename V = void> struct is_tensor_of_scalar : public std::false_type {};
  template<typename T> struct is_tensor_of_scalar<T, typename std::enable_if<is_tensor<T>::value && is_scalar<typename T::Scalar>::value>::type> : public std::true_type {};
  // Is this an Eigen tensor of a supported container
  template<typename T, typename V = void> struct is_tensor_of_container : public std::false_type {};
  template<typename T> struct is_tensor_of_container<T, typename std::enable_if<is_tensor<T>::value && isGridTensor<typename T::Scalar>::value>::type> : public std::true_type {};

Eigen tensors are regular, multidimensional objects, and each Reader/Writer
was extended to support this new datatype. Where the Eigen tensor contains
a Grid tensor, the dimensions of the data written are the dimensions of the
Eigen tensor plus the dimensions of the underlying Grid scalar. Dimensions
of size 1 are preserved.
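
For example (an illustrative sketch only -- the tensor type and sizes here are
assumptions, not taken from the Grid sources), a rank-2 Eigen tensor of Grid
colour matrices would be written with the Eigen dimensions followed by the
Grid tensor dimensions::

  // Hypothetical illustration of the dimension layout described above.
  // ColourMatrix is a 3x3 Grid tensor; the Eigen dimensions are 2 x 5.
  Eigen::Tensor<ColourMatrix, 2> t(2, 5);
  // The serialised data carries dimensions 2 x 5 x 3 x 3
  // (Eigen dimensions, then Grid scalar dimensions), in row-major order.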

**New Reader/Writer methods for multi-dimensional data**::

  template <typename U>
  void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
  template <typename U>
  void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);

On readback, the Eigen tensor rank must match the data being read, but the tensor
dimensions will be resized if necessary. Resizing is not possible for Eigen::TensorMap<T>
because these tensors use a buffer provided at construction, and this buffer cannot be changed.
Deserialisation failures cause Grid to assert.
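
A minimal usage sketch (assumptions: the generic ``write``/``read`` free
functions dispatch to these methods for Eigen tensors, and the file and
dataset names are invented for this example)::

  Eigen::Tensor<ComplexD, 3> t(2, 3, 4);   // rank-3 tensor of complex scalars
  t.setRandom();
  {
    Hdf5Writer writer("tensor.h5");
    write(writer, "t", t);                 // dimensions are stored with the data
  }
  Eigen::Tensor<ComplexD, 3> u;            // rank must match the stored data;
  {                                        // dimensions are resized on read
    Hdf5Reader reader("tensor.h5");
    read(reader, "t", u);
  }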

HDF5 Optimisations -- added June 2021
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Grid serialisation is intended to be light and deterministic, and to provide a layer of abstraction over
multiple file formats. HDF5 excels at handling multi-dimensional data, and the Grid HDF5Reader/HDF5Writer exploits this.
When serialising nested ``std::vector<T>``, where ``T`` is an arithmetic or complex type,
the Hdf5Writer writes the data as an Hdf5 DataSet object.

However, nested ``std::vector<std::vector<...T>>`` might be "ragged", i.e. not necessarily regular. E.g. a 3d nested
``std::vector`` might contain 2 rows, the first being a 2x2 block and the second a 1x2 block.
A bug existed whereby this was not checked on write, so nested, ragged vectors
were written as a regular dataset, with a buffer under/overrun and jumbled contents.

Clearly this was not used in production, as the bug went undetected until now. Fixing this bug
is an opportunity to further optimise the HDF5 file format.

The goals of this change are to:

* Make changes to the Hdf5 file format only -- i.e. do not impact other file formats

* Implement file format changes in such a way that they are transparent to the Grid reader

* Correct the bug for ragged vectors of numeric / complex types

* Extend the support of nested std::vector<T> to arbitrarily nested Grid tensors

The trait class ``element`` has been redefined to ``is_flattenable``, which is a trait class for
potentially "flattenable" objects. These are (possibly nested) ``std::vector<T>`` where ``T`` is
an arithmetic, complex or Grid tensor type. Flattenable objects are tested on write
(with the function ``isRegularShape``) to see whether they actually are regular.

Flattenable, regular objects are written to a multidimensional HDF5 DataSet.
Otherwise, an Hdf5 sub group is created with the object "name", and each element of the outer dimension is
recursively written as object "name_n", where n is a 0-indexed number.

On readback (by Grid), the presence of a subgroup containing the attribute ``Grid_vector_size`` triggers a
"ragged read"; otherwise a read from a DataSet is attempted.
Data parallel field IO
|
||||
-----------------------
|
||||
|
||||
|
@ -4,7 +4,7 @@ using namespace Grid;
|
||||
template<class Field>
|
||||
void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
|
||||
{
|
||||
RealD cp, c, alpha, d, beta, ssq, qq;
|
||||
RealD cp, c, alpha, d, beta, ssq;
|
||||
RealD Tolerance=1.0e-10;
|
||||
int MaxIterations=10000;
|
||||
|
||||
|
539
examples/Example_wall_wall_3pt.cc
Normal file
@ -0,0 +1,539 @@
|
||||
/*
|
||||
* Warning: This code is illustrative only: not well tested, and not meant for production use
|
||||
* without regression / tests being applied
|
||||
*/
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
typedef SpinColourMatrix Propagator;
|
||||
typedef SpinColourVector Fermion;
|
||||
typedef PeriodicGimplR GimplR;
|
||||
|
||||
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
|
||||
{
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
GridBase *grid;
|
||||
GaugeField U;
|
||||
|
||||
CovariantLaplacianCshift(GaugeField &_U) :
|
||||
grid(_U.Grid()),
|
||||
U(_U) { };
|
||||
|
||||
virtual GridBase *Grid(void) { return grid; };
|
||||
|
||||
virtual void M (const Field &in, Field &out)
|
||||
{
|
||||
out=Zero();
|
||||
for(int mu=0;mu<Nd-1;mu++) {
|
||||
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient
|
||||
out = out - Gimpl::CovShiftForward(Umu,mu,in);
|
||||
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
|
||||
out = out + 2.0*in;
|
||||
}
|
||||
};
|
||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented; needed only for multigrid
|
||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented; needed only for multigrid
|
||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented; needed only for multigrid
|
||||
};
|
||||
|
||||
void MakePhase(Coordinate mom,LatticeComplex &phase)
|
||||
{
|
||||
GridBase *grid = phase.Grid();
|
||||
auto latt_size = grid->GlobalDimensions();
|
||||
ComplexD ci(0.0,1.0);
|
||||
phase=Zero();
|
||||
|
||||
LatticeComplex coor(phase.Grid());
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||
LatticeCoordinate(coor,mu);
|
||||
phase = phase + (TwoPiL * mom[mu]) * coor;
|
||||
}
|
||||
phase = exp(phase*ci);
|
||||
}
|
||||
void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
|
||||
{
|
||||
Smear_Stout<GimplR> Stout(rho);
|
||||
LatticeGaugeField Utmp(Uin.Grid());
|
||||
Utmp = Uin;
|
||||
for(int i=0;i<nstep;i++){
|
||||
Stout.smear(Usmr,Utmp);
|
||||
Utmp = Usmr;
|
||||
}
|
||||
}
|
||||
void PointSource(Coordinate &coor,LatticePropagator &source)
|
||||
{
|
||||
// Coordinate coor({0,0,0,0});
|
||||
source=Zero();
|
||||
SpinColourMatrix kronecker; kronecker=1.0;
|
||||
pokeSite(kronecker,source,coor);
|
||||
}
|
||||
void GFWallSource(int tslice,LatticePropagator &source)
|
||||
{
|
||||
GridBase *grid = source.Grid();
|
||||
LatticeComplex one(grid); one = ComplexD(1.0,0.0);
|
||||
LatticeComplex zz(grid); zz=Zero();
|
||||
LatticeInteger t(grid);
|
||||
LatticeCoordinate(t,Tdir);
|
||||
one = where(t==Integer(tslice), one, zz);
|
||||
source = 1.0;
|
||||
source = source * one;
|
||||
}
|
||||
|
||||
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
|
||||
{
|
||||
GridBase *grid = source.Grid();
|
||||
LatticeComplex noise(grid);
|
||||
LatticeComplex zz(grid); zz=Zero();
|
||||
LatticeInteger t(grid);
|
||||
|
||||
RealD nrm=1.0/sqrt(2);
|
||||
bernoulli(RNG, noise); // 0,1 50:50
|
||||
|
||||
noise = (2.*noise - Complex(1,1))*nrm;
|
||||
|
||||
LatticeCoordinate(t,Tdir);
|
||||
noise = where(t==Integer(tslice), noise, zz);
|
||||
|
||||
source = 1.0;
|
||||
source = source*noise;
|
||||
std::cout << " Z2 wall " << norm2(source) << std::endl;
|
||||
}
|
||||
void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
|
||||
{
|
||||
Real alpha=0.05;
|
||||
|
||||
Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);
|
||||
|
||||
std::cout << " Initial plaquette "<<plaq << std::endl;
|
||||
|
||||
LatticeColourMatrix xform(U.Grid());
|
||||
Ufix = U;
|
||||
int orthog=Nd-1;
|
||||
FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);
|
||||
|
||||
plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);
|
||||
|
||||
std::cout << " Final plaquette "<<plaq << std::endl;
|
||||
}
|
||||
template<class Field>
|
||||
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
|
||||
{
|
||||
typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
|
||||
Laplacian_t Laplacian(U);
|
||||
|
||||
Integer Iterations = 40;
|
||||
Real width = 2.0;
|
||||
Real coeff = (width*width) / Real(4*Iterations);
|
||||
|
||||
Field tmp(U.Grid());
|
||||
smeared=unsmeared;
|
||||
// chi = (1-p^2/2N)^N kronecker
|
||||
for(int n = 0; n < Iterations; ++n) {
|
||||
Laplacian.M(smeared,tmp);
|
||||
smeared = smeared - coeff*tmp;
|
||||
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
|
||||
}
|
||||
}
|
||||
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
|
||||
{
|
||||
LatticePropagator tmp(source.Grid());
|
||||
PointSource(site,source);
|
||||
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
|
||||
tmp = source;
|
||||
GaussianSmear(U,tmp,source);
|
||||
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
|
||||
}
|
||||
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
|
||||
{
|
||||
Z2WallSource(RNG,tslice,source);
|
||||
auto tmp = source;
|
||||
GaussianSmear(U,tmp,source);
|
||||
}
|
||||
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
|
||||
{
|
||||
assert(mom.size()==Nd);
|
||||
assert(mom[Tdir] == 0);
|
||||
|
||||
GridBase * grid = spectator.Grid();
|
||||
|
||||
LatticeInteger ts(grid);
|
||||
LatticeCoordinate(ts,Tdir);
|
||||
source = Zero();
|
||||
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
|
||||
|
||||
LatticeComplex phase(grid);
|
||||
MakePhase(mom,phase);
|
||||
|
||||
source = source *phase;
|
||||
}
|
||||
template<class Action>
|
||||
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
GridBase *UGrid = D.GaugeGrid();
|
||||
GridBase *FGrid = D.FermionGrid();
|
||||
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion src5 (FGrid);
|
||||
LatticeFermion result5(FGrid);
|
||||
LatticeFermion result4(UGrid);
|
||||
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
|
||||
SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
|
||||
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
|
||||
D.ImportPhysicalFermionSource(src4,src5);
|
||||
|
||||
result5=Zero();
|
||||
schur(D,src5,result5,ZG);
|
||||
std::cout<<GridLogMessage
|
||||
<<"spin "<<s<<" color "<<c
|
||||
<<" norm2(src5d) " <<norm2(src5)
|
||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
||||
|
||||
D.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class MesonFile: Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
|
||||
};
|
||||
|
||||
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
|
||||
{
|
||||
const int nchannel=4;
|
||||
Gamma::Algebra Gammas[nchannel][2] = {
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
|
||||
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
|
||||
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
|
||||
};
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeComplex meson_CF(q1.Grid());
|
||||
MesonFile MF;
|
||||
|
||||
for(int ch=0;ch<nchannel;ch++){
|
||||
|
||||
Gamma Gsrc(Gammas[ch][0]);
|
||||
Gamma Gsnk(Gammas[ch][1]);
|
||||
|
||||
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
|
||||
|
||||
std::vector<TComplex> meson_T;
|
||||
sliceSum(meson_CF,meson_T, Tdir);
|
||||
|
||||
int nt=meson_T.size();
|
||||
|
||||
std::vector<Complex> corr(nt);
|
||||
for(int t=0;t<nt;t++){
|
||||
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly; have not figured out a workaround
|
||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
|
||||
}
|
||||
MF.data.push_back(corr);
|
||||
}
|
||||
|
||||
{
|
||||
XmlWriter WR(file);
|
||||
write(WR,"MesonFile",MF);
|
||||
}
|
||||
}
|
||||
|
||||
void Meson3pt(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
|
||||
{
|
||||
const int nchannel=4;
|
||||
Gamma::Algebra Gammas[nchannel][2] = {
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaX},
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaY},
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaZ},
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaT}
|
||||
};
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeComplex meson_CF(q1.Grid());
|
||||
MesonFile MF;
|
||||
|
||||
for(int ch=0;ch<nchannel;ch++){
|
||||
|
||||
Gamma Gsrc(Gammas[ch][0]);
|
||||
Gamma Gsnk(Gammas[ch][1]);
|
||||
|
||||
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
|
||||
|
||||
std::vector<TComplex> meson_T;
|
||||
sliceSum(meson_CF,meson_T, Tdir);
|
||||
|
||||
int nt=meson_T.size();
|
||||
|
||||
std::vector<Complex> corr(nt);
|
||||
for(int t=0;t<nt;t++){
|
||||
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly; have not figured out a workaround
|
||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
|
||||
}
|
||||
MF.data.push_back(corr);
|
||||
}
|
||||
|
||||
{
|
||||
XmlWriter WR(file);
|
||||
write(WR,"MesonFile",MF);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
|
||||
{
|
||||
const int nchannel=4;
|
||||
Gamma::Algebra Gammas[nchannel][2] = {
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
|
||||
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
|
||||
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
|
||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
|
||||
};
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
int nt=q1.size();
|
||||
std::vector<Complex> meson_CF(nt);
|
||||
MesonFile MF;
|
||||
|
||||
for(int ch=0;ch<nchannel;ch++){
|
||||
|
||||
Gamma Gsrc(Gammas[ch][0]);
|
||||
Gamma Gsnk(Gammas[ch][1]);
|
||||
|
||||
std::vector<Complex> corr(nt);
|
||||
for(int t=0;t<nt;t++){
|
||||
meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
|
||||
corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly; have not figured out a workaround
|
||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
|
||||
}
|
||||
MF.data.push_back(corr);
|
||||
}
|
||||
|
||||
{
|
||||
XmlWriter WR(file);
|
||||
write(WR,"MesonFile",MF);
|
||||
}
|
||||
}
|
||||
int make_idx(int p, int m,int nmom)
|
||||
{
|
||||
if (m==0) return p;
|
||||
assert(p==0);
|
||||
return nmom + m - 1;
|
||||
}
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
// Double precision grids
|
||||
auto latt = GridDefaultLatt();
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
LatticeGaugeField Utmp(UGrid);
|
||||
LatticeGaugeField Usmr(UGrid);
|
||||
std::string config;
|
||||
if( argc > 1 && argv[1][0] != '-' )
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu, header, argv[1]);
|
||||
config=argv[1];
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
|
||||
SU<Nc>::ColdConfiguration(Umu);
|
||||
config="ColdConfig";
|
||||
}
|
||||
// GaugeFix(Umu,Utmp);
|
||||
// Umu=Utmp;
|
||||
|
||||
int nsmr=3;
|
||||
RealD rho=0.1;
|
||||
LinkSmear(nsmr,rho,Umu,Usmr);
|
||||
|
||||
|
||||
std::vector<int> smeared_link({ 0,0,1} );
|
||||
std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
|
||||
std::vector<RealD> M5s ({ 1.8,1.8,1.0} );
|
||||
std::vector<RealD> bs ({ 1.0,1.0,1.5} ); // DDM
|
||||
std::vector<RealD> cs ({ 0.0,0.0,0.5} ); // DDM
|
||||
std::vector<int> Ls_s ({ 16,16,12} );
|
||||
std::vector<GridCartesian *> FGrids;
|
||||
std::vector<GridRedBlackCartesian *> FrbGrids;
|
||||
|
||||
std::vector<Coordinate> momenta;
|
||||
momenta.push_back(Coordinate({0,0,0,0}));
|
||||
momenta.push_back(Coordinate({1,0,0,0}));
|
||||
momenta.push_back(Coordinate({2,0,0,0}));
|
||||
|
||||
int nmass = masses.size();
|
||||
int nmom = momenta.size();
|
||||
|
||||
std::vector<MobiusFermionR *> FermActs;
|
||||
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
typedef MobiusFermionR FermionAction;
|
||||
FermionAction::ImplParams Params(boundary);
|
||||
|
||||
for(int m=0;m<masses.size();m++) {
|
||||
|
||||
RealD mass = masses[m];
|
||||
RealD M5 = M5s[m];
|
||||
RealD b = bs[m];
|
||||
RealD c = cs[m];
|
||||
int Ls = Ls_s[m];
|
||||
|
||||
if ( smeared_link[m] ) Utmp = Usmr;
|
||||
else Utmp = Umu;
|
||||
|
||||
FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
|
||||
FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));
|
||||
|
||||
FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
|
||||
}
|
||||
|
||||
LatticePropagator z2wall_source(UGrid);
|
||||
LatticePropagator gfwall_source(UGrid);
|
||||
LatticePropagator phased_prop(UGrid);
|
||||
|
||||
int tslice = 0;
|
||||
int tseq=(tslice+16)%latt[Nd-1];
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// RNG seeded for Z2 wall
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// You can manage seeds however you like.
|
||||
// Recommend SeedUniqueString.
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
|
||||
Z2WallSource (RNG4,tslice,z2wall_source);
|
||||
GFWallSource (tslice,gfwall_source);
|
||||
|
||||
std::vector<LatticeComplex> phase(nmom,UGrid);
|
||||
for(int m=0;m<nmom;m++){
|
||||
MakePhase(momenta[m],phase[m]);
|
||||
}
|
||||
|
||||
std::vector<LatticePropagator> Z2Props (nmom+nmass-1,UGrid);
|
||||
std::vector<LatticePropagator> GFProps (nmom+nmass-1,UGrid);
|
||||
for(int p=0;p<nmom;p++) {
|
||||
int m=0;
|
||||
int idx = make_idx(p,m,nmom);
|
||||
phased_prop = z2wall_source * phase[p];
|
||||
Solve(*FermActs[m],phased_prop ,Z2Props[idx]);
|
||||
|
||||
phased_prop = gfwall_source * phase[p];
|
||||
Solve(*FermActs[m],phased_prop ,GFProps[idx]);
|
||||
}
|
||||
for(int m=1;m<nmass;m++) {
|
||||
int p=0;
|
||||
int idx = make_idx(p,m,nmom);
|
||||
phased_prop = z2wall_source;
|
||||
Solve(*FermActs[m],phased_prop ,Z2Props[idx]);
|
||||
|
||||
phased_prop = gfwall_source;
|
||||
Solve(*FermActs[m],phased_prop ,GFProps[idx]);
|
||||
}

  std::vector<std::vector<Propagator> > wsnk_z2Props(nmom+nmass-1);
  std::vector<std::vector<Propagator> > wsnk_gfProps(nmom+nmass-1);

  // Kaon and D two-point functions at non-zero momentum:
  // WW -- put the momentum on m1 (the lighter quark),
  // zero momentum on m2
  for(int m1=0;m1<nmass;m1++) {
  for(int m2=m1;m2<nmass;m2++) {
    int pmax = (m1==0)? nmom:1;
    for(int p=0;p<pmax;p++){

      std::stringstream ssg,ssz;
      std::stringstream wssg,wssz;

      int idx1 = make_idx(p,m1,nmom);
      int idx2 = make_idx(0,m2,nmom);

      /// Point sinks
      ssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
      ssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";
      MesonTrace(ssz.str(),Z2Props[idx1],Z2Props[idx2],phase[p]); // Q1 is conjugated
      MesonTrace(ssg.str(),GFProps[idx1],GFProps[idx2],phase[p]);

      /// Wall sinks: project the zero-momentum (m2) leg onto phase[p]
      wssg<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
      wssz<<config<<"_p"<<p<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";

      phased_prop = GFProps[idx2] * phase[p];
      sliceSum(phased_prop,wsnk_gfProps[m2],Tdir);
      sliceSum(GFProps[idx1],wsnk_gfProps[m1],Tdir);
      WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);

      phased_prop = Z2Props[idx2] * phase[p];
      sliceSum(phased_prop,wsnk_z2Props[m2],Tdir);
      sliceSum(Z2Props[idx1],wsnk_z2Props[m1],Tdir);
      WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);
    }
  }}

  /////////////////////////////////////
  // Sequential solves
  /////////////////////////////////////
  LatticePropagator seq_wsnk_z2src(UGrid);
  LatticePropagator seq_wsnk_gfsrc(UGrid);
  LatticePropagator seq_psnk_z2src(UGrid);
  LatticePropagator seq_psnk_gfsrc(UGrid);
  LatticePropagator source(UGrid);
  for(int m=0;m<nmass-1;m++){
    int spect_idx = make_idx(0,m,nmom);
    int charm=nmass-1;

    SequentialSource(tseq,momenta[0],GFProps[spect_idx],source);
    Solve(*FermActs[charm],source,seq_psnk_gfsrc);

    SequentialSource(tseq,momenta[0],Z2Props[spect_idx],source);
    Solve(*FermActs[charm],source,seq_psnk_z2src);

    // Todo need wall sequential solve
    for(int p=0;p<nmom;p++){
      int active_idx = make_idx(p,0,nmom);
      std::stringstream seq_3pt_p_z2;
      std::stringstream seq_3pt_p_gf;
      std::stringstream seq_3pt_w_z2;
      std::stringstream seq_3pt_w_gf;
      seq_3pt_p_z2 <<config<<"_3pt_p"<<p<< "_m" << m << "_p_z2_meson.xml";
      seq_3pt_p_gf <<config<<"_3pt_p"<<p<< "_m" << m << "_p_gf_meson.xml";
      seq_3pt_w_z2 <<config<<"_3pt_p"<<p<< "_m" << m << "_w_z2_meson.xml";
      seq_3pt_w_gf <<config<<"_3pt_p"<<p<< "_m" << m << "_w_gf_meson.xml";
      Meson3pt(seq_3pt_p_gf.str(),GFProps[active_idx],seq_psnk_gfsrc,phase[p]);
      Meson3pt(seq_3pt_p_z2.str(),Z2Props[active_idx],seq_psnk_z2src,phase[p]);
    }
  }

  Grid_finalize();
}
433
examples/Example_wall_wall_spectrum.cc
Normal file
@ -0,0 +1,433 @@
/*
 * Warning: This code is illustrative only; it is not well tested, and not meant for production use
 * without regression / tests being applied
 */

#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
typedef SpinColourMatrix Propagator;
typedef SpinColourVector Fermion;
typedef PeriodicGimplR   GimplR;

template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
  INHERIT_GIMPL_TYPES(Gimpl);

  GridBase *grid;
  GaugeField U;

  CovariantLaplacianCshift(GaugeField &_U) :
    grid(_U.Grid()),
    U(_U) { };

  virtual GridBase *Grid(void) { return grid; };

  virtual void M (const Field &in, Field &out)
  {
    out=Zero();
    for(int mu=0;mu<Nd-1;mu++) {
      GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficient
      out = out - Gimpl::CovShiftForward(Umu,mu,in);
      out = out - Gimpl::CovShiftBackward(Umu,mu,in);
      out = out + 2.0*in;
    }
  };
  virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
  virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented; needed only for multigrid
  virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented; needed only for multigrid
  virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented; needed only for multigrid
};

void MakePhase(Coordinate mom,LatticeComplex &phase)
{
  GridBase *grid = phase.Grid();
  auto latt_size = grid->GlobalDimensions();
  ComplexD ci(0.0,1.0);
  phase=Zero();

  LatticeComplex coor(phase.Grid());
  for(int mu=0;mu<Nd;mu++){
    RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
    LatticeCoordinate(coor,mu);
    phase = phase + (TwoPiL * mom[mu]) * coor;
  }
  phase = exp(phase*ci);
}
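Editorial note: MakePhase builds the plane wave exp(i 2*pi p.x/L) site by site. A minimal usage sketch, assuming a GridBase *UGrid and a LatticePropagator prop already exist (both hypothetical names here):

// Project prop onto lattice momentum p=(1,0,0,0), in units of 2*pi/L
Coordinate p({1,0,0,0});
LatticeComplex phase(UGrid);
MakePhase(p,phase);
LatticePropagator phased(UGrid);
phased = prop * phase; // each site picks up exp(i 2 pi x_0 / L_0)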
void LinkSmear(int nstep, RealD rho,LatticeGaugeField &Uin,LatticeGaugeField &Usmr)
{
  Smear_Stout<GimplR> Stout(rho);
  LatticeGaugeField Utmp(Uin.Grid());
  Utmp = Uin;
  for(int i=0;i<nstep;i++){
    Stout.smear(Usmr,Utmp);
    Utmp = Usmr;
  }
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
  //  Coordinate coor({0,0,0,0});
  source=Zero();
  SpinColourMatrix kronecker; kronecker=1.0;
  pokeSite(kronecker,source,coor);
}
void GFWallSource(int tslice,LatticePropagator &source)
{
  GridBase *grid = source.Grid();
  LatticeComplex one(grid); one = ComplexD(1.0,0.0);
  LatticeComplex zz(grid); zz=Zero();
  LatticeInteger t(grid);
  LatticeCoordinate(t,Tdir);
  one = where(t==Integer(tslice), one, zz);
  source = 1.0;
  source = source * one;
}

void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
  GridBase *grid = source.Grid();
  LatticeComplex noise(grid);
  LatticeComplex zz(grid); zz=Zero();
  LatticeInteger t(grid);

  RealD nrm=1.0/sqrt(2);
  bernoulli(RNG, noise); // 0,1 50:50

  noise = (2.*noise - Complex(1,1))*nrm;

  LatticeCoordinate(t,Tdir);
  noise = where(t==Integer(tslice), noise, zz);

  source = 1.0;
  source = source*noise;
  std::cout << " Z2 wall " << norm2(source) << std::endl;
}
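Editorial note: after the bernoulli fill and the affine map, each site on the wall holds one of (+/-1 +/- i)/sqrt(2), so the noise has unit modulus per site and the printed norm2 should equal Nspin*Ncolour times the spatial volume of the timeslice. A hedged sanity-check sketch using only the helpers defined in this file:

// Expect norm2 == Ns*Nc * (L_x*L_y*L_z) for a single-timeslice Z2 wall
void CheckZ2WallNorm(GridParallelRNG &RNG, GridBase *UGrid)
{
  LatticePropagator src(UGrid);
  Z2WallSource(RNG,0,src);
  std::cout << " Z2 wall norm2 check " << norm2(src) << std::endl;
}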
void GaugeFix(LatticeGaugeField &U,LatticeGaugeField &Ufix)
{
  Real alpha=0.05;

  Real plaq=WilsonLoops<GimplR>::avgPlaquette(U);

  std::cout << " Initial plaquette "<<plaq << std::endl;

  LatticeColourMatrix xform(U.Grid());
  Ufix = U;
  int orthog=Nd-1;
  FourierAcceleratedGaugeFixer<GimplR>::SteepestDescentGaugeFix(Ufix,xform,alpha,100000,1.0e-14, 1.0e-14,true,orthog);

  plaq=WilsonLoops<GimplR>::avgPlaquette(Ufix);

  std::cout << " Final plaquette "<<plaq << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
  typedef CovariantLaplacianCshift <GimplR,Field> Laplacian_t;
  Laplacian_t Laplacian(U);

  Integer Iterations = 40;
  Real width = 2.0;
  Real coeff = (width*width) / Real(4*Iterations);

  Field tmp(U.Grid());
  smeared=unsmeared;
  // chi = (1-p^2/2N)^N kronecker
  for(int n = 0; n < Iterations; ++n) {
    Laplacian.M(smeared,tmp);
    smeared = smeared - coeff*tmp;
    std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
  }
}
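Editorial note: the comment in the loop is the whole story. With coeff = width^2/(4*Iterations), each pass applies one factor of the product, so on every eigenmode of the covariant Laplacian with eigenvalue p^2 the loop realises

\left(1-\frac{w^{2}p^{2}}{4N}\right)^{N}\;\xrightarrow[N\to\infty]{}\;e^{-w^{2}p^{2}/4},

i.e. a Gaussian damping of high-momentum modes with r.m.s. smearing radius set by w (here w = 2.0, N = 40). M above is the positive operator 2(Nd-1) minus the spatial hops, which is what makes the exponent a suppression.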
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
  LatticePropagator tmp(source.Grid());
  PointSource(site,source);
  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
  tmp = source;
  GaussianSmear(U,tmp,source);
  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
  Z2WallSource(RNG,tslice,source);
  auto tmp = source;
  GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
  assert(mom.size()==Nd);
  assert(mom[Tdir] == 0);

  GridBase * grid = spectator.Grid();

  LatticeInteger ts(grid);
  LatticeCoordinate(ts,Tdir);
  source = Zero();
  source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else

  LatticeComplex phase(grid);
  MakePhase(mom,phase);

  source = source *phase;
}
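Editorial note: in equations, SequentialSource builds from the spectator propagator S(z,0) the source

\eta(z) \;=\; \delta_{t_z,\,t_{\rm seq}}\; e^{\,i\vec p\cdot\vec z}\; S(z,0),

so that solving D\,\psi=\eta gives the propagator running from the original wall, through a fixed-momentum sink at timeslice t_seq, back to any point -- the building block of the sequential three-point functions assembled in the companion example above.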
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
  GridBase *UGrid = D.GaugeGrid();
  GridBase *FGrid = D.FermionGrid();

  LatticeFermion src4 (UGrid); src4 = Zero();
  LatticeFermion src5 (FGrid);
  LatticeFermion result5(FGrid);
  LatticeFermion result4(UGrid);

  ConjugateGradient<LatticeFermion> CG(1.0e-12,100000);
  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
  ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
  std::cout<<GridLogMessage<< " source4 "<<norm2(source)<<std::endl;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      PropToFerm<Action>(src4,source,s,c);
      std::cout<<GridLogMessage<< s<<c<<" src4 "<<norm2(src4)<<std::endl;
      D.ImportPhysicalFermionSource(src4,src5);
      std::cout<<GridLogMessage<< s<<c<<" src5 "<<norm2(src5)<<std::endl;

      result5=Zero();
      schur(D,src5,result5,ZG);
      std::cout<<GridLogMessage
        <<"spin "<<s<<" color "<<c
        <<" norm2(src5d) " <<norm2(src5)
        <<" norm2(result5d) "<<norm2(result5)<<std::endl;

      D.ExportPhysicalFermionSolution(result5,result4);

      FermToProp<Action>(propagator,result4,s,c);
    }
  }
}
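Editorial note: Solve runs twelve red-black preconditioned CG inversions, one per spin-colour component, importing each 4d source into the 5d domain-wall system and repacking the 4d solutions into a propagator. A minimal usage sketch, assuming the action and source objects constructed in main below:

// One call: wall source in, full 12-component propagator out
Solve(*FermActs[0], z2wall_source, Z2Props[0]);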

class MesonFile: Serializable {
public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};

void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };

  Gamma G5(Gamma::Algebra::Gamma5);

  LatticeComplex meson_CF(q1.Grid());
  MesonFile MF;

  for(int ch=0;ch<nchannel;ch++){

    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);

    meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));

    std::vector<TComplex> meson_T;
    sliceSum(meson_CF,meson_T, Tdir);

    int nt=meson_T.size();

    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly; no workaround found yet
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }

  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
}
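Editorial note: the contraction uses gamma5-hermiticity, S(0,x) = \gamma_5 S(x,0)^\dagger \gamma_5, so each channel computes

C_\Gamma(t) \;=\; \sum_{\vec x}\,\mathrm{Tr}\!\left[\,\gamma_5\, q_1^\dagger(x)\,\gamma_5\;\Gamma_{\rm snk}\; q_2(x)\;\Gamma_{\rm src}^\dagger\,\right],

with (Gsrc, Gsnk) running over the four pseudoscalar/axial combinations tabulated above.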

void WallSinkMesonTrace(std::string file,std::vector<Propagator> &q1,std::vector<Propagator> &q2)
{
  const int nchannel=4;
  Gamma::Algebra Gammas[nchannel][2] = {
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
    {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
    {Gamma::Algebra::Gamma5      ,Gamma::Algebra::GammaTGamma5}
  };

  Gamma G5(Gamma::Algebra::Gamma5);
  int nt=q1.size();
  std::vector<Complex> meson_CF(nt);
  MesonFile MF;

  for(int ch=0;ch<nchannel;ch++){

    Gamma Gsrc(Gammas[ch][0]);
    Gamma Gsnk(Gammas[ch][1]);

    std::vector<Complex> corr(nt);
    for(int t=0;t<nt;t++){
      meson_CF[t] = trace(G5*adj(q1[t])*G5*Gsnk*q2[t]*adj(Gsrc));
      corr[t] = TensorRemove(meson_CF[t]); // Yes this is ugly; no workaround found yet
      std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
    }
    MF.data.push_back(corr);
  }

  {
    XmlWriter WR(file);
    write(WR,"MesonFile",MF);
  }
}

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  // Double precision grids
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                         GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                         GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);


  LatticeGaugeField Umu(UGrid);
  LatticeGaugeField Utmp(UGrid);
  LatticeGaugeField Usmr(UGrid);
  std::string config;
  if( argc > 1 && argv[1][0] != '-' )
  {
    std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
    FieldMetaData header;
    NerscIO::readConfiguration(Umu, header, argv[1]);
    config=argv[1];
  }
  else
  {
    std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
    SU<Nc>::ColdConfiguration(Umu);
    config="ColdConfig";
  }
  //  GaugeFix(Umu,Utmp);
  //  Umu=Utmp;

  int nsmr=3;
  RealD rho=0.1;
  RealD plaq_gf =WilsonLoops<GimplR>::avgPlaquette(Umu);
  LinkSmear(nsmr,rho,Umu,Usmr);
  RealD plaq_smr=WilsonLoops<GimplR>::avgPlaquette(Usmr);
  std::cout << GridLogMessage << " GF Plaquette " <<plaq_gf<<std::endl;
  std::cout << GridLogMessage << " SM Plaquette " <<plaq_smr<<std::endl;

  std::vector<int>   smeared_link({ 0,0,1} );
  std::vector<RealD> masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
  std::vector<RealD> M5s   ({ 1.8,1.8,1.0} );
  std::vector<RealD> bs    ({ 1.0,1.0,1.5} ); // DDM
  std::vector<RealD> cs    ({ 0.0,0.0,0.5} ); // DDM
  std::vector<int>   Ls_s  ({ 16,16,12} );
  std::vector<GridCartesian *> FGrids;
  std::vector<GridRedBlackCartesian *> FrbGrids;

  int nmass = masses.size();

  std::vector<MobiusFermionR *> FermActs;

  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  typedef MobiusFermionR FermionAction;
  FermionAction::ImplParams Params(boundary);

  for(int m=0;m<masses.size();m++) {

    RealD mass = masses[m];
    RealD M5   = M5s[m];
    RealD b    = bs[m];
    RealD c    = cs[m];
    int   Ls   = Ls_s[m];

    if ( smeared_link[m] ) Utmp = Usmr;
    else                   Utmp = Umu;

    FGrids.push_back(SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid));
    FrbGrids.push_back(SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid));

    FermActs.push_back(new MobiusFermionR(Utmp,*FGrids[m],*FrbGrids[m],*UGrid,*UrbGrid,mass,M5,b,c,Params));
  }

  LatticePropagator z2wall_source(UGrid);
  LatticePropagator gfwall_source(UGrid);

  int tslice = 0;
  //////////////////////////////////////////////////////////////////////
  // RNG seeded for Z2 wall
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  GridParallelRNG RNG4(UGrid);  RNG4.SeedUniqueString("Study2-Source_Z2_p_0_0_0_t_0-880");
  Z2WallSource  (RNG4,tslice,z2wall_source);
  GFWallSource  (tslice,gfwall_source);

  std::vector<LatticePropagator> Z2Props (nmass,UGrid);
  std::vector<LatticePropagator> GFProps (nmass,UGrid);

  for(int m=0;m<nmass;m++) {

    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<<std::endl;
    Solve(*FermActs[m],z2wall_source ,Z2Props[m]);
    std::cout << GridLogMessage << " Mass " <<m << " gfwall source "<<norm2(gfwall_source)<<std::endl;
    Solve(*FermActs[m],gfwall_source ,GFProps[m]);

    std::cout << GridLogMessage << " Mass " <<m << " z2wall source "<<norm2(z2wall_source)<< " " << norm2(gfwall_source)<<std::endl;

  }

  LatticeComplex phase(UGrid);
  Coordinate mom({0,0,0,0});
  MakePhase(mom,phase);

  std::vector<std::vector<Propagator> > wsnk_z2Props(nmass);
  std::vector<std::vector<Propagator> > wsnk_gfProps(nmass);
  for(int m=0;m<nmass;m++){
    sliceSum(Z2Props[m],wsnk_z2Props[m],Tdir);
    sliceSum(GFProps[m],wsnk_gfProps[m],Tdir);
  }

  for(int m1=0 ;m1<nmass;m1++) {
  for(int m2=m1;m2<nmass;m2++) {
    std::stringstream ssg,ssz;
    std::stringstream wssg,wssz;

    /// Point sinks
    ssg<<config<< "_m" << m1 << "_m"<< m2 << "_p_gf_meson.xml";
    ssz<<config<< "_m" << m1 << "_m"<< m2 << "_p_z2_meson.xml";

    MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
    MesonTrace(ssg.str(),GFProps[m1],GFProps[m2],phase);

    /// Wall sinks
    wssg<<config<< "_m" << m1 << "_m"<< m2 << "_w_gf_meson.xml";
    wssz<<config<< "_m" << m1 << "_m"<< m2 << "_w_z2_meson.xml";

    WallSinkMesonTrace(wssg.str(),wsnk_gfProps[m1],wsnk_gfProps[m2]);
    WallSinkMesonTrace(wssz.str(),wsnk_z2Props[m1],wsnk_z2Props[m2]);

  }}
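Editorial note: one tidy-up worth flagging -- the fermion actions are allocated with new and never freed. A hedged cleanup sketch, placed before Grid_finalize():

// Free the heap-allocated fermion actions (grids could be freed similarly)
for(int m=0;m<nmass;m++) delete FermActs[m];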

  Grid_finalize();
}
129
systems/Booster/comms.4node.perf
Normal file
@ -0,0 +1,129 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number    : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi:  World communicator of size 16
SharedMemoryMpi:  Node  communicator of size 4
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x1463a0000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
__|_                                    _|__
__|_   GGGG    RRRR    III    DDDD      _|__
__|_  G        R   R    I     D   D     _|__
__|_  G        R   R    I     D    D    _|__
__|_  G  GG    RRRR     I     D    D    _|__
__|_  G   G    R  R     I     D   D     _|__
__|_   GGGG    R   R   III    DDDD      _|__
__|_                                    _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.729967 s : Grid is setup to use 4 threads
Grid : Message : 0.729975 s : Number of iterations to average: 250
Grid : Message : 0.729977 s : ====================================================================================================
Grid : Message : 0.729978 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 0.729979 s : ====================================================================================================
Grid : Message : 0.729980 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 0.749870 s : 8 8 393216 50783.4 101566.8
Grid : Message : 0.764282 s : 8 8 393216 54704.5 109409.0
Grid : Message : 0.780310 s : 8 8 393216 49090.6 98181.3
Grid : Message : 0.796479 s : 8 8 393216 48662.3 97324.7
Grid : Message : 0.841551 s : 12 8 1327104 66728.9 133457.8
Grid : Message : 0.880653 s : 12 8 1327104 67932.9 135865.9
Grid : Message : 0.920097 s : 12 8 1327104 67304.2 134608.4
Grid : Message : 0.961444 s : 12 8 1327104 64205.9 128411.8
Grid : Message : 1.660890 s : 16 8 3145728 67833.1 135666.3
Grid : Message : 1.153006 s : 16 8 3145728 72416.3 144832.6
Grid : Message : 1.240962 s : 16 8 3145728 71536.1 143072.2
Grid : Message : 1.330372 s : 16 8 3145728 70372.7 140745.3
Grid : Message : 1.519996 s : 20 8 6144000 71017.4 142034.8
Grid : Message : 1.667745 s : 20 8 6144000 83189.5 166378.9
Grid : Message : 1.817908 s : 20 8 6144000 81836.5 163673.1
Grid : Message : 1.969344 s : 20 8 6144000 81148.0 162296.0
Grid : Message : 2.260249 s : 24 8 10616832 79299.9 158599.8
Grid : Message : 2.512319 s : 24 8 10616832 84249.2 168498.4
Grid : Message : 2.763820 s : 24 8 10616832 84430.4 168860.9
Grid : Message : 3.172850 s : 24 8 10616832 83776.5 167553.1
Grid : Message : 3.460951 s : 28 8 16859136 82176.6 164353.1
Grid : Message : 3.859348 s : 28 8 16859136 84642.9 169285.9
Grid : Message : 4.254351 s : 28 8 16859136 85366.0 170731.9
Grid : Message : 4.651748 s : 28 8 16859136 84850.2 169700.4
Grid : Message : 5.302166 s : 32 8 25165824 83402.1 166804.1
Grid : Message : 5.889123 s : 32 8 25165824 85756.3 171512.6
Grid : Message : 6.472357 s : 32 8 25165824 86299.1 172598.3
Grid : Message : 7.572140 s : 32 8 25165824 86059.7 172119.3
Grid : Message : 7.578700 s : ====================================================================================================
Grid : Message : 7.578740 s : = Benchmarking sequential halo exchange from GPU memory
Grid : Message : 7.578750 s : ====================================================================================================
Grid : Message : 7.578760 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max)
Grid : Message : 7.119231 s : 8 8 393216 13844.9 27689.8
Grid : Message : 7.150661 s : 8 8 393216 25034.4 50068.9
Grid : Message : 7.173800 s : 8 8 393216 34002.0 68004.0
Grid : Message : 7.197415 s : 8 8 393216 33317.7 66635.5
Grid : Message : 7.240696 s : 12 8 1327104 110772.0 221544.0
Grid : Message : 7.263466 s : 12 8 1327104 116627.5 233254.9
Grid : Message : 7.310752 s : 12 8 1327104 56142.8 112285.6
Grid : Message : 7.356881 s : 12 8 1327104 57551.3 115102.6
Grid : Message : 7.422351 s : 16 8 3145728 167086.0 334172.0
Grid : Message : 7.458334 s : 16 8 3145728 174903.6 349807.1
Grid : Message : 7.558746 s : 16 8 3145728 62663.3 125326.6
Grid : Message : 7.658824 s : 16 8 3145728 62871.8 125743.6
Grid : Message : 7.741423 s : 20 8 6144000 231840.3 463680.6
Grid : Message : 7.794862 s : 20 8 6144000 229996.1 459992.1
Grid : Message : 7.982472 s : 20 8 6144000 65501.1 131002.1
Grid : Message : 8.170548 s : 20 8 6144000 65338.8 130677.5
Grid : Message : 8.277182 s : 24 8 10616832 274319.0 548638.0
Grid : Message : 8.354585 s : 24 8 10616832 274365.1 548730.2
Grid : Message : 8.675675 s : 24 8 10616832 66132.8 132265.7
Grid : Message : 8.999237 s : 24 8 10616832 65627.4 131254.7
Grid : Message : 9.140302 s : 28 8 16859136 300825.0 601650.0
Grid : Message : 9.251320 s : 28 8 16859136 303749.1 607498.1
Grid : Message : 9.632241 s : 28 8 16859136 88520.3 177040.6
Grid : Message : 9.999663 s : 28 8 16859136 91772.9 183545.7
Grid : Message : 10.183071 s : 32 8 25165824 328325.5 656651.1
Grid : Message : 10.335093 s : 32 8 25165824 331109.7 662219.3
Grid : Message : 10.875980 s : 32 8 25165824 93056.0 186111.9
Grid : Message : 11.418666 s : 32 8 25165824 92747.5 185495.0
Grid : Message : 11.434792 s : ====================================================================================================
Grid : Message : 11.434797 s : = All done; Bye Bye
Grid : Message : 11.434798 s : ====================================================================================================
14
systems/Booster/config-command
Normal file
@ -0,0 +1,14 @@
LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
../../configure \
	--enable-comms=mpi \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-shm=nvlink \
	--enable-accelerator=cuda \
	--with-lime=$LIME \
	--disable-accelerator-cshift \
	--disable-unified \
	CXX=nvcc \
	LDFLAGS="-cudart shared " \
	CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
156
systems/Booster/dwf.16node.perf
Normal file
@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number    : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi:  World communicator of size 64
SharedMemoryMpi:  Node  communicator of size 4
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ac40000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
__|_                                    _|__
__|_   GGGG    RRRR    III    DDDD      _|__
__|_  G        R   R    I     D   D     _|__
__|_  G        R   R    I     D    D    _|__
__|_  G  GG    RRRR     I     D    D    _|__
__|_  G   G    R  R     I     D   D     _|__
__|_   GGGG    R   R   III    DDDD      _|__
__|_                                    _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
Current Grid git commit hash=f660dc67e4b193afc4015bc5e5fe47cfdbb0356e: (HEAD -> develop, origin/develop, origin/HEAD) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.910318 s : Grid Layout
Grid : Message : 0.910320 s : Global lattice size  : 64 64 64 256
Grid : Message : 0.910325 s : OpenMP threads       : 4
Grid : Message : 0.910326 s : MPI tasks            : 2 2 2 8
Grid : Message : 0.973956 s : Making s innermost grids
Grid : Message : 1.198830 s : Initialising 4d RNG
Grid : Message : 1.119813 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 1.119870 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.683307 s : Initialising 5d RNG
Grid : Message : 4.220535 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 4.220563 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 37.198140 s : Initialised RNGs
Grid : Message : 39.952612 s : Drawing gauge field
Grid : Message : 40.488019 s : Random gauge initialised
Grid : Message : 42.659220 s : Setting up Cshift based reference
Grid : Message : 47.622210 s : *****************************************************************
Grid : Message : 47.622236 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 47.622237 s : *****************************************************************
Grid : Message : 47.622238 s : *****************************************************************
Grid : Message : 47.622239 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 47.622240 s : * Vectorising space-time by 8
Grid : Message : 47.622241 s : * VComplexF size is 64 B
Grid : Message : 47.622242 s : * SINGLE precision
Grid : Message : 47.622243 s : * Using Overlapped Comms/Compute
Grid : Message : 47.622244 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 47.622245 s : *****************************************************************
Grid : Message : 48.950210 s : Called warmup
Grid : Message : 77.311124 s : Called Dw 3000 times in 2.83592e+07 us
Grid : Message : 77.311181 s : mflop/s = 1.49934e+08
Grid : Message : 77.311184 s : mflop/s per rank = 2.34273e+06
Grid : Message : 77.311185 s : mflop/s per node = 9.37091e+06
Grid : Message : 77.311186 s : RF GiB/s (base 2) = 304663
Grid : Message : 77.311187 s : mem GiB/s (base 2) = 190415
Grid : Message : 77.314752 s : norm diff 1.03478e-13
Grid : Message : 77.349587 s : #### Dhop calls report
Grid : Message : 77.349591 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 77.349613 s : WilsonFermion5D TotalTime /Calls : 4761.53 us
Grid : Message : 77.349615 s : WilsonFermion5D CommTime /Calls : 3363.09 us
Grid : Message : 77.349616 s : WilsonFermion5D FaceTime /Calls : 469.094 us
Grid : Message : 77.349617 s : WilsonFermion5D ComputeTime1/Calls : 26.8794 us
Grid : Message : 77.349618 s : WilsonFermion5D ComputeTime2/Calls : 949.276 us
Grid : Message : 77.349702 s : Average mflops/s per call : 2.68569e+10
Grid : Message : 77.349710 s : Average mflops/s per call per rank : 4.1964e+08
Grid : Message : 77.349711 s : Average mflops/s per call per node : 1.67856e+09
Grid : Message : 77.349712 s : Average mflops/s per call (full) : 1.51538e+08
Grid : Message : 77.349713 s : Average mflops/s per call per rank (full): 2.36779e+06
Grid : Message : 77.349714 s : Average mflops/s per call per node (full): 9.47115e+06
Grid : Message : 77.349715 s : WilsonFermion5D Stencil
Grid : Message : 77.349716 s : WilsonFermion5D StencilEven
Grid : Message : 77.349717 s : WilsonFermion5D StencilOdd
Grid : Message : 77.349718 s : WilsonFermion5D Stencil     Reporti()
Grid : Message : 77.349719 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 77.349720 s : WilsonFermion5D StencilOdd  Reporti()
Grid : Message : 104.883719 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 104.883743 s : Called DwDag
Grid : Message : 104.883744 s : norm dag result 12.0421
Grid : Message : 104.901901 s : norm dag ref    12.0421
Grid : Message : 104.917822 s : norm dag diff   7.63254e-14
Grid : Message : 104.957229 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 105.334551 s : src_e0.499998
Grid : Message : 105.416616 s : src_o0.500002
Grid : Message : 105.486729 s : *********************************************************
Grid : Message : 105.486732 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 105.486733 s : * Vectorising space-time by 8
Grid : Message : 105.486734 s : * SINGLE precision
Grid : Message : 105.486739 s : * Using Overlapped Comms/Compute
Grid : Message : 105.486740 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 105.486741 s : *********************************************************
Grid : Message : 119.695464 s : Deo mflop/s = 1.5039e+08
Grid : Message : 119.695494 s : Deo mflop/s per rank 2.34984e+06
Grid : Message : 119.695496 s : Deo mflop/s per node 9.39937e+06
Grid : Message : 119.695502 s : #### Dhop calls report
Grid : Message : 119.695503 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 119.695505 s : WilsonFermion5D TotalTime /Calls : 4734.45 us
Grid : Message : 119.695507 s : WilsonFermion5D CommTime /Calls : 3287.23 us
Grid : Message : 119.695508 s : WilsonFermion5D FaceTime /Calls : 537.724 us
Grid : Message : 119.695509 s : WilsonFermion5D ComputeTime1/Calls : 16.0483 us
Grid : Message : 119.695510 s : WilsonFermion5D ComputeTime2/Calls : 939.854 us
Grid : Message : 119.695533 s : Average mflops/s per call : 4.50726e+10
Grid : Message : 119.695535 s : Average mflops/s per call per rank : 7.04259e+08
Grid : Message : 119.695536 s : Average mflops/s per call per node : 2.81703e+09
Grid : Message : 119.695537 s : Average mflops/s per call (full) : 1.52405e+08
Grid : Message : 119.695538 s : Average mflops/s per call per rank (full): 2.38133e+06
Grid : Message : 119.695539 s : Average mflops/s per call per node (full): 9.52532e+06
Grid : Message : 119.695540 s : WilsonFermion5D Stencil
Grid : Message : 119.695541 s : WilsonFermion5D StencilEven
Grid : Message : 119.695542 s : WilsonFermion5D StencilOdd
Grid : Message : 119.695543 s : WilsonFermion5D Stencil     Reporti()
Grid : Message : 119.695544 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 119.695545 s : WilsonFermion5D StencilOdd  Reporti()
Grid : Message : 119.752707 s : r_e6.02108
Grid : Message : 119.759448 s : r_o6.02101
Grid : Message : 119.765382 s : res12.0421
Grid : Message : 120.419093 s : norm diff   0
Grid : Message : 120.829772 s : norm diff even  0
Grid : Message : 120.909078 s : norm diff odd   0
156
systems/Booster/dwf.4node.perf
Normal file
@ -0,0 +1,156 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number    : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42505273344
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 3
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi:  World communicator of size 16
SharedMemoryMpi:  Node  communicator of size 4
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e9c0000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
__|_                                    _|__
__|_   GGGG    RRRR    III    DDDD      _|__
__|_  G        R   R    I     D   D     _|__
__|_  G        R   R    I     D    D    _|__
__|_  G  GG    RRRR     I     D    D    _|__
__|_  G   G    R  R     I     D   D     _|__
__|_   GGGG    R   R   III    DDDD      _|__
__|_                                    _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
  |  |  |  |  |  |  |  |  |  |  |  |  |  |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34004218675 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.717713 s : Grid Layout
Grid : Message : 0.717716 s : Global lattice size  : 64 64 64 64
Grid : Message : 0.717724 s : OpenMP threads       : 4
Grid : Message : 0.717725 s : MPI tasks            : 2 2 2 2
Grid : Message : 0.801634 s : Making s innermost grids
Grid : Message : 0.844903 s : Initialising 4d RNG
Grid : Message : 0.940001 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.940060 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 1.338368 s : Initialising 5d RNG
Grid : Message : 2.859273 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.859304 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 11.140924 s : Initialised RNGs
Grid : Message : 13.433456 s : Drawing gauge field
Grid : Message : 13.955847 s : Random gauge initialised
Grid : Message : 15.528535 s : Setting up Cshift based reference
Grid : Message : 21.484340 s : *****************************************************************
Grid : Message : 21.484840 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 21.484860 s : *****************************************************************
Grid : Message : 21.484870 s : *****************************************************************
Grid : Message : 21.484880 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 21.484890 s : * Vectorising space-time by 8
Grid : Message : 21.484900 s : * VComplexF size is 64 B
Grid : Message : 21.484910 s : * SINGLE precision
Grid : Message : 21.484920 s : * Using Overlapped Comms/Compute
Grid : Message : 21.484930 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 21.484940 s : *****************************************************************
Grid : Message : 22.344741 s : Called warmup
Grid : Message : 49.832292 s : Called Dw 3000 times in 2.74873e+07 us
Grid : Message : 49.832358 s : mflop/s = 3.86726e+07
Grid : Message : 49.832360 s : mflop/s per rank = 2.41704e+06
Grid : Message : 49.832361 s : mflop/s per node = 9.66814e+06
Grid : Message : 49.832362 s : RF GiB/s (base 2) = 78581.7
Grid : Message : 49.832363 s : mem GiB/s (base 2) = 49113.6
Grid : Message : 49.835924 s : norm diff 1.03481e-13
Grid : Message : 49.870568 s : #### Dhop calls report
Grid : Message : 49.870574 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 49.870598 s : WilsonFermion5D TotalTime /Calls : 4616.79 us
Grid : Message : 49.870600 s : WilsonFermion5D CommTime /Calls : 3241.77 us
Grid : Message : 49.870601 s : WilsonFermion5D FaceTime /Calls : 469.006 us
Grid : Message : 49.870602 s : WilsonFermion5D ComputeTime1/Calls : 27.0492 us
Grid : Message : 49.870603 s : WilsonFermion5D ComputeTime2/Calls : 926.33 us
Grid : Message : 49.870614 s : Average mflops/s per call : 6.71631e+09
Grid : Message : 49.870619 s : Average mflops/s per call per rank : 4.19769e+08
Grid : Message : 49.870621 s : Average mflops/s per call per node : 1.67908e+09
Grid : Message : 49.870626 s : Average mflops/s per call (full) : 3.90723e+07
Grid : Message : 49.870627 s : Average mflops/s per call per rank (full): 2.44202e+06
Grid : Message : 49.870628 s : Average mflops/s per call per node (full): 9.76808e+06
Grid : Message : 49.870629 s : WilsonFermion5D Stencil
Grid : Message : 49.870630 s : WilsonFermion5D StencilEven
Grid : Message : 49.870631 s : WilsonFermion5D StencilOdd
Grid : Message : 49.870632 s : WilsonFermion5D Stencil     Reporti()
Grid : Message : 49.870633 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 49.870634 s : WilsonFermion5D StencilOdd  Reporti()
Grid : Message : 77.321890 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 77.321911 s : Called DwDag
Grid : Message : 77.321912 s : norm dag result 12.0421
Grid : Message : 77.334619 s : norm dag ref    12.0421
Grid : Message : 77.350515 s : norm dag diff   7.63236e-14
Grid : Message : 77.389923 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 77.769815 s : src_e0.499997
Grid : Message : 77.847560 s : src_o0.500003
Grid : Message : 77.917493 s : *********************************************************
Grid : Message : 77.917496 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 77.917497 s : * Vectorising space-time by 8
Grid : Message : 77.917498 s : * SINGLE precision
Grid : Message : 77.917499 s : * Using Overlapped Comms/Compute
Grid : Message : 77.917500 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 77.917501 s : *********************************************************
Grid : Message : 91.412946 s : Deo mflop/s = 3.95925e+07
Grid : Message : 91.412978 s : Deo mflop/s per rank 2.47453e+06
Grid : Message : 91.412980 s : Deo mflop/s per node 9.89813e+06
Grid : Message : 91.412983 s : #### Dhop calls report
Grid : Message : 91.412984 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 91.412986 s : WilsonFermion5D TotalTime /Calls : 4496.84 us
Grid : Message : 91.412988 s : WilsonFermion5D CommTime /Calls : 3057.28 us
Grid : Message : 91.412989 s : WilsonFermion5D FaceTime /Calls : 528.499 us
Grid : Message : 91.412990 s : WilsonFermion5D ComputeTime1/Calls : 16.1939 us
Grid : Message : 91.412991 s : WilsonFermion5D ComputeTime2/Calls : 942.557 us
Grid : Message : 91.413021 s : Average mflops/s per call : 1.12574e+10
Grid : Message : 91.413023 s : Average mflops/s per call per rank : 7.03586e+08
Grid : Message : 91.413024 s : Average mflops/s per call per node : 2.81434e+09
Grid : Message : 91.413025 s : Average mflops/s per call (full) : 4.01145e+07
Grid : Message : 91.413026 s : Average mflops/s per call per rank (full): 2.50716e+06
Grid : Message : 91.413027 s : Average mflops/s per call per node (full): 1.00286e+07
Grid : Message : 91.413028 s : WilsonFermion5D Stencil
Grid : Message : 91.413029 s : WilsonFermion5D StencilEven
Grid : Message : 91.413030 s : WilsonFermion5D StencilOdd
Grid : Message : 91.413031 s : WilsonFermion5D Stencil     Reporti()
Grid : Message : 91.413032 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 91.413033 s : WilsonFermion5D StencilOdd  Reporti()
Grid : Message : 91.470394 s : r_e6.02111
Grid : Message : 91.476539 s : r_o6.02102
Grid : Message : 91.482442 s : res12.0421
Grid : Message : 92.138799 s : norm diff   0
Grid : Message : 92.545354 s : norm diff even  0
Grid : Message : 92.619444 s : norm diff odd   0
29
systems/Booster/dwf16.slurm
Normal file
@ -0,0 +1,29 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=16
#SBATCH --ntasks=64
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=0:30:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4

export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap --comms-concurrent"


srun -N 16 -n $SLURM_NTASKS \
	./benchmarks/Benchmark_dwf_fp32 \
	$OPT \
	--mpi 2.2.2.8 \
	--accelerator-threads 8 \
	--grid 64.64.64.256 \
	--shm 2048 > dwf.16node.perf
39
systems/Booster/dwf4.slurm
Normal file
@ -0,0 +1,39 @@
#!/bin/sh
#SBATCH --account=gm2dwf
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --time=2:00:00
#SBATCH --partition=develbooster
#SBATCH --gres=gpu:4

export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n

OPT="--comms-overlap --comms-concurrent"

srun -N 4 -n $SLURM_NTASKS \
	./benchmarks/Benchmark_dwf_fp32 \
	$OPT \
	--mpi 2.2.2.2 \
	--accelerator-threads 8 \
	--grid 64.64.64.64 \
	--shm 2048 > dwf.4node.perf


srun -N 4 -n $SLURM_NTASKS \
	./benchmarks/Benchmark_comms_host_device \
	--mpi 2.2.2.2 \
	--accelerator-threads 8 \
	--grid 64.64.64.64 \
	--shm 2048 > comms.4node.perf
5
systems/Booster/sourceme.sh
Normal file
@ -0,0 +1,5 @@
module load GCC/9.3.0
module load GMP/6.2.0
module load MPFR/4.1.0
module load OpenMPI/4.1.0rc1
module load CUDA/11.3
12
systems/Crusher/config-command
Normal file
@ -0,0 +1,12 @@
../../configure --enable-comms=mpi-auto \
	--enable-unified=no \
	--enable-shm=nvlink \
	--enable-accelerator=hip \
	--enable-gen-simd-width=64 \
	--enable-simd=GPU \
	--disable-fermion-reps \
	--disable-gparity \
	CXX=hipcc MPICXX=mpicxx \
	CXXFLAGS="-fPIC -I/opt/rocm-4.5.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
	LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
HIPFLAGS = --amdgpu-target=gfx90a
30
systems/Crusher/dwf.slurm
Normal file
@ -0,0 +1,30 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
##SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --exclusive

DIR=.
module list
#export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=1

AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE

PARAMS=" --accelerator-threads ${AT} --grid 24.24.24.24 --shm-mpi 0 --mpi 1.1.1.1"

srun --gpus-per-task 1 -n1 ./benchmarks/Benchmark_dwf_fp32 $PARAMS
27
systems/Crusher/dwf4.slurm
Normal file
@ -0,0 +1,27 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --exclusive

DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=4

echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 8 --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"

srun --gpus-per-task 1 -n4 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
27
systems/Crusher/dwf8.slurm
Normal file
@ -0,0 +1,27 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 8
#SBATCH --exclusive

DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=1

echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads 8 --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"

srun --gpus-per-task 1 -n8 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
12
systems/Crusher/mpiwrapper.sh
Executable file
@ -0,0 +1,12 @@
#!/bin/bash

lrank=$SLURM_LOCALID

export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID

echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"

$*
5
systems/Crusher/sourceme.sh
Normal file
@ -0,0 +1,5 @@
module load PrgEnv-gnu
module load rocm/4.5.0
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
129
systems/Perlmutter/comms.4node
Normal file
@ -0,0 +1,129 @@
|
||||
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 2
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x7f8d40000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 1073741824 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34005057536 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.956704 s : Grid is setup to use 32 threads
Grid : Message : 0.956709 s : Number of iterations to average: 250
Grid : Message : 0.956712 s : ====================================================================================================
Grid : Message : 0.956713 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 0.956714 s : ====================================================================================================
Grid : Message : 0.956715 s : L Ls bytes MB/s uni MB/s bidi
Grid : Message : 1.108420 s : 8 8 393216 15427.2 30854.4
Grid : Message : 1.198740 s : 8 8 393216 87332.8 174665.6
Grid : Message : 1.574400 s : 8 8 393216 20938.0 41876.0
Grid : Message : 1.956280 s : 8 8 393216 20598.0 41196.0
Grid : Message : 1.125254 s : 12 8 1327104 105614.9 211229.8
Grid : Message : 1.149709 s : 12 8 1327104 108578.8 217157.5
Grid : Message : 1.262612 s : 12 8 1327104 23510.2 47020.4
Grid : Message : 1.377804 s : 12 8 1327104 23043.0 46086.0
Grid : Message : 1.445986 s : 16 8 3145728 107931.9 215863.7
Grid : Message : 1.501495 s : 16 8 3145728 113380.0 226760.0
Grid : Message : 1.766377 s : 16 8 3145728 23752.8 47505.6
Grid : Message : 2.301720 s : 16 8 3145728 23850.6 47701.2
Grid : Message : 2.158035 s : 20 8 6144000 109657.5 219315.0
Grid : Message : 2.268232 s : 20 8 6144000 111535.7 223071.4
Grid : Message : 2.779996 s : 20 8 6144000 24011.8 48023.6
Grid : Message : 3.289081 s : 20 8 6144000 24137.8 48275.7
Grid : Message : 3.549101 s : 24 8 10616832 89696.1 179392.2
Grid : Message : 3.779416 s : 24 8 10616832 92205.2 184410.4
Grid : Message : 4.656539 s : 24 8 10616832 24209.0 48417.9
Grid : Message : 5.531893 s : 24 8 10616832 24257.5 48515.0
Grid : Message : 6.800400 s : 28 8 16859136 76106.8 152213.6
Grid : Message : 6.443946 s : 28 8 16859136 77350.6 154701.1
Grid : Message : 7.830994 s : 28 8 16859136 24309.8 48619.6
Grid : Message : 9.215301 s : 28 8 16859136 24357.8 48715.5
Grid : Message : 9.955615 s : 32 8 25165824 72403.7 144807.4
Grid : Message : 10.648284 s : 32 8 25165824 72666.2 145332.4
Grid : Message : 12.713098 s : 32 8 25165824 24376.2 48752.3
Grid : Message : 14.775577 s : 32 8 25165824 24403.6 48807.3
Grid : Message : 14.777794 s : ====================================================================================================
Grid : Message : 14.777799 s : = Benchmarking sequential halo exchange from GPU memory
Grid : Message : 14.777800 s : ====================================================================================================
Grid : Message : 14.777801 s : L Ls bytes MB/s uni MB/s bidi
Grid : Message : 14.798392 s : 8 8 393216 49210.4 98420.9
Grid : Message : 14.812519 s : 8 8 393216 55716.0 111432.1
Grid : Message : 14.861908 s : 8 8 393216 15926.4 31852.9
Grid : Message : 14.909307 s : 8 8 393216 16594.5 33189.1
Grid : Message : 14.938366 s : 12 8 1327104 157435.7 314871.3
Grid : Message : 14.954490 s : 12 8 1327104 164724.6 329449.3
Grid : Message : 15.921650 s : 12 8 1327104 19280.2 38560.4
Grid : Message : 15.229618 s : 12 8 1327104 19311.3 38622.7
Grid : Message : 15.275707 s : 16 8 3145728 221257.5 442514.9
Grid : Message : 15.303489 s : 16 8 3145728 226547.7 453095.4
Grid : Message : 15.619610 s : 16 8 3145728 19902.6 39805.2
Grid : Message : 15.935287 s : 16 8 3145728 19930.6 39861.2
Grid : Message : 15.999038 s : 20 8 6144000 269586.0 539172.0
Grid : Message : 16.435890 s : 20 8 6144000 275886.8 551773.7
Grid : Message : 16.652349 s : 20 8 6144000 20185.6 40371.2
Grid : Message : 17.262005 s : 20 8 6144000 20156.0 40311.9
Grid : Message : 17.351417 s : 24 8 10616832 300428.2 600856.4
Grid : Message : 17.421125 s : 24 8 10616832 304656.8 609313.6
Grid : Message : 18.477072 s : 24 8 10616832 20108.9 40217.7
Grid : Message : 19.556481 s : 24 8 10616832 19671.8 39343.6
Grid : Message : 19.681365 s : 28 8 16859136 318966.5 637933.1
Grid : Message : 19.786400 s : 28 8 16859136 321056.1 642112.1
Grid : Message : 21.531557 s : 28 8 16859136 19321.2 38642.4
Grid : Message : 23.384312 s : 28 8 16859136 18199.2 36398.3
Grid : Message : 23.556358 s : 32 8 25165824 332397.6 664795.2
Grid : Message : 23.706392 s : 32 8 25165824 335492.9 670985.8
Grid : Message : 26.356425 s : 32 8 25165824 18992.9 37985.9
Grid : Message : 29.126692 s : 32 8 25165824 18168.6 36337.3
Grid : Message : 29.137480 s : ====================================================================================================
Grid : Message : 29.137485 s : = All done; Bye Bye
Grid : Message : 29.137486 s : ====================================================================================================
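Reading the table: each row gives the local lattice size L (with Ls=8), the packet size in bytes, and the achieved rate; the bidirectional column is by construction exactly twice the unidirectional one. A quick check in shell arithmetic on the best GPU-memory row above:

# best uni-directional GPU-memory rate at L=32 (25165824-byte packets)
uni=332397.6
echo "$uni * 2" | bc    # -> 664795.2, matching the MB/s bidi column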
12 systems/Perlmutter/config-command (Normal file)
@@ -0,0 +1,12 @@
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-shm=nvlink \
--enable-gen-simd-width=64 \
--enable-accelerator=cuda \
--disable-fermion-reps \
--disable-unified \
--disable-gparity \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin CC -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
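The flags match the Perlmutter hardware: --enable-accelerator=cuda with arch=compute_80,code=sm_80 targets the A100s, and --enable-shm=nvlink routes intra-node shared memory over NVLink. The intended flow (a sketch; the build-directory name is an assumption, but ../../configure implies a build tree two levels below the source root):

# from the top of the Grid tree
source systems/Perlmutter/sourceme.sh
mkdir -p build/Perlmutter && cd build/Perlmutter   # hypothetical build directory
sh ../../systems/Perlmutter/config-command          # runs the ../../configure line above
make -j 16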
156 systems/Perlmutter/dwf.48.48.48.48.4node.opt0 (Normal file)
@@ -0,0 +1,156 @@
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 2
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fc320000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34005057536 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.762377 s : Grid Layout
Grid : Message : 0.762378 s : Global lattice size : 48 48 48 48
Grid : Message : 0.762381 s : OpenMP threads : 32
Grid : Message : 0.762382 s : MPI tasks : 2 2 2 2
Grid : Message : 0.790912 s : Making s innermost grids
Grid : Message : 0.817408 s : Initialising 4d RNG
Grid : Message : 0.840908 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.840921 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.911684 s : Initialising 5d RNG
Grid : Message : 1.270530 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.270544 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 1.568435 s : Initialised RNGs
Grid : Message : 2.241446 s : Drawing gauge field
Grid : Message : 2.318921 s : Random gauge initialised
Grid : Message : 2.779258 s : Setting up Cshift based reference
Grid : Message : 3.188306 s : *****************************************************************
Grid : Message : 3.188315 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 3.188316 s : *****************************************************************
Grid : Message : 3.188316 s : *****************************************************************
Grid : Message : 3.188316 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 3.188316 s : * Vectorising space-time by 8
Grid : Message : 3.188317 s : * VComplexF size is 64 B
Grid : Message : 3.188318 s : * SINGLE precision
Grid : Message : 3.188318 s : * Using Overlapped Comms/Compute
Grid : Message : 3.188318 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 3.188318 s : *****************************************************************
Grid : Message : 3.548355 s : Called warmup
Grid : Message : 37.809000 s : Called Dw 3000 times in 3.42606e+07 us
Grid : Message : 37.809040 s : mflop/s = 9.81714e+06
Grid : Message : 37.809042 s : mflop/s per rank = 613572
Grid : Message : 37.809043 s : mflop/s per node = 2.45429e+06
Grid : Message : 37.809044 s : RF GiB/s (base 2) = 19948.2
Grid : Message : 37.809045 s : mem GiB/s (base 2) = 12467.6
Grid : Message : 37.810181 s : norm diff 1.03662e-13
Grid : Message : 37.824163 s : #### Dhop calls report
Grid : Message : 37.824168 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 37.824172 s : WilsonFermion5D TotalTime /Calls : 5719.36 us
Grid : Message : 37.824173 s : WilsonFermion5D CommTime /Calls : 5085.34 us
Grid : Message : 37.824174 s : WilsonFermion5D FaceTime /Calls : 265.445 us
Grid : Message : 37.824175 s : WilsonFermion5D ComputeTime1/Calls : 23.4602 us
Grid : Message : 37.824176 s : WilsonFermion5D ComputeTime2/Calls : 370.89 us
Grid : Message : 37.824191 s : Average mflops/s per call : 2.36923e+09
Grid : Message : 37.824194 s : Average mflops/s per call per rank : 1.48077e+08
Grid : Message : 37.824195 s : Average mflops/s per call per node : 5.92307e+08
Grid : Message : 37.824196 s : Average mflops/s per call (full) : 9.97945e+06
Grid : Message : 37.824197 s : Average mflops/s per call per rank (full): 623716
Grid : Message : 37.824198 s : Average mflops/s per call per node (full): 2.49486e+06
Grid : Message : 37.824199 s : WilsonFermion5D Stencil
Grid : Message : 37.824199 s : WilsonFermion5D StencilEven
Grid : Message : 37.824199 s : WilsonFermion5D StencilOdd
Grid : Message : 37.824199 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 37.824199 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 37.824199 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 41.538537 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 41.538549 s : Called DwDag
Grid : Message : 41.538550 s : norm dag result 12.0422
Grid : Message : 41.543416 s : norm dag ref 12.0422
Grid : Message : 41.548999 s : norm dag diff 7.6086e-14
Grid : Message : 41.563564 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 41.711516 s : src_e0.499992
Grid : Message : 41.735103 s : src_o0.500008
Grid : Message : 41.756142 s : *********************************************************
Grid : Message : 41.756144 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 41.756145 s : * Vectorising space-time by 8
Grid : Message : 41.756146 s : * SINGLE precision
Grid : Message : 41.756147 s : * Using Overlapped Comms/Compute
Grid : Message : 41.756148 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 41.756148 s : *********************************************************
Grid : Message : 59.255023 s : Deo mflop/s = 9.6274e+06
Grid : Message : 59.255044 s : Deo mflop/s per rank 601712
Grid : Message : 59.255046 s : Deo mflop/s per node 2.40685e+06
Grid : Message : 59.255048 s : #### Dhop calls report
Grid : Message : 59.255049 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 59.255050 s : WilsonFermion5D TotalTime /Calls : 5830.89 us
Grid : Message : 59.255051 s : WilsonFermion5D CommTime /Calls : 5143.28 us
Grid : Message : 59.255052 s : WilsonFermion5D FaceTime /Calls : 316.834 us
Grid : Message : 59.255053 s : WilsonFermion5D ComputeTime1/Calls : 37.4065 us
Grid : Message : 59.255054 s : WilsonFermion5D ComputeTime2/Calls : 375.889 us
Grid : Message : 59.255076 s : Average mflops/s per call : 1.4225e+09
Grid : Message : 59.255077 s : Average mflops/s per call per rank : 8.8906e+07
Grid : Message : 59.255078 s : Average mflops/s per call per node : 3.55624e+08
Grid : Message : 59.255079 s : Average mflops/s per call (full) : 9.78858e+06
Grid : Message : 59.255080 s : Average mflops/s per call per rank (full): 611786
Grid : Message : 59.255081 s : Average mflops/s per call per node (full): 2.44714e+06
Grid : Message : 59.255082 s : WilsonFermion5D Stencil
Grid : Message : 59.255082 s : WilsonFermion5D StencilEven
Grid : Message : 59.255082 s : WilsonFermion5D StencilOdd
Grid : Message : 59.255082 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 59.255082 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 59.255082 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 59.286796 s : r_e6.02129
Grid : Message : 59.290118 s : r_o6.02097
Grid : Message : 59.292558 s : res12.0423
Grid : Message : 59.482803 s : norm diff 0
Grid : Message : 59.604297 s : norm diff even 0
Grid : Message : 59.626743 s : norm diff odd 0
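In this run CommTime dominates: roughly 89% of each Dhop call is communication, so the 48^4 global volume on 4 nodes is firmly comms-bound even with overlapped comms/compute. The ratio, as a one-liner:

# CommTime/Calls over TotalTime/Calls from the report above
echo "scale=4; 5085.34/5719.36" | bc    # -> .8891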
156 systems/Perlmutter/dwf.48.48.48.48.4node.opt1 (Normal file)
@@ -0,0 +1,156 @@
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 2
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fbae0000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34005057536 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.692368 s : Grid Layout
Grid : Message : 0.692369 s : Global lattice size : 48 48 48 48
Grid : Message : 0.692372 s : OpenMP threads : 32
Grid : Message : 0.692372 s : MPI tasks : 2 2 2 2
Grid : Message : 0.701977 s : Making s innermost grids
Grid : Message : 0.711295 s : Initialising 4d RNG
Grid : Message : 0.734938 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.734948 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.798281 s : Initialising 5d RNG
Grid : Message : 1.161711 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.161728 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 1.522440 s : Initialised RNGs
Grid : Message : 2.260710 s : Drawing gauge field
Grid : Message : 2.102597 s : Random gauge initialised
Grid : Message : 2.562592 s : Setting up Cshift based reference
Grid : Message : 3.121880 s : *****************************************************************
Grid : Message : 3.121970 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 3.121980 s : *****************************************************************
Grid : Message : 3.121980 s : *****************************************************************
Grid : Message : 3.121980 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 3.121980 s : * Vectorising space-time by 8
Grid : Message : 3.121980 s : * VComplexF size is 64 B
Grid : Message : 3.121990 s : * SINGLE precision
Grid : Message : 3.121990 s : * Using Overlapped Comms/Compute
Grid : Message : 3.121990 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 3.121990 s : *****************************************************************
Grid : Message : 3.350688 s : Called warmup
Grid : Message : 35.847527 s : Called Dw 3000 times in 3.24968e+07 us
Grid : Message : 35.847576 s : mflop/s = 1.035e+07
Grid : Message : 35.847578 s : mflop/s per rank = 646874
Grid : Message : 35.847579 s : mflop/s per node = 2.5875e+06
Grid : Message : 35.847580 s : RF GiB/s (base 2) = 21030.9
Grid : Message : 35.847581 s : mem GiB/s (base 2) = 13144.3
Grid : Message : 35.848697 s : norm diff 1.03662e-13
Grid : Message : 35.861967 s : #### Dhop calls report
Grid : Message : 35.861973 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 35.861976 s : WilsonFermion5D TotalTime /Calls : 5426 us
Grid : Message : 35.861977 s : WilsonFermion5D CommTime /Calls : 4817.47 us
Grid : Message : 35.861978 s : WilsonFermion5D FaceTime /Calls : 246.175 us
Grid : Message : 35.861979 s : WilsonFermion5D ComputeTime1/Calls : 8.72676 us
Grid : Message : 35.861980 s : WilsonFermion5D ComputeTime2/Calls : 370.494 us
Grid : Message : 35.861995 s : Average mflops/s per call : 6.50606e+09
Grid : Message : 35.861999 s : Average mflops/s per call per rank : 4.06629e+08
Grid : Message : 35.862000 s : Average mflops/s per call per node : 1.62652e+09
Grid : Message : 35.862001 s : Average mflops/s per call (full) : 1.0519e+07
Grid : Message : 35.862002 s : Average mflops/s per call per rank (full): 657438
Grid : Message : 35.862003 s : Average mflops/s per call per node (full): 2.62975e+06
Grid : Message : 35.862004 s : WilsonFermion5D Stencil
Grid : Message : 35.862004 s : WilsonFermion5D StencilEven
Grid : Message : 35.862004 s : WilsonFermion5D StencilOdd
Grid : Message : 35.862004 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 35.862004 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 35.862004 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 39.599406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 39.599421 s : Called DwDag
Grid : Message : 39.599422 s : norm dag result 12.0422
Grid : Message : 39.604317 s : norm dag ref 12.0422
Grid : Message : 39.609961 s : norm dag diff 7.6086e-14
Grid : Message : 39.624145 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 39.772334 s : src_e0.499992
Grid : Message : 39.795705 s : src_o0.500008
Grid : Message : 39.816822 s : *********************************************************
Grid : Message : 39.816824 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 39.816825 s : * Vectorising space-time by 8
Grid : Message : 39.816826 s : * SINGLE precision
Grid : Message : 39.816827 s : * Using Overlapped Comms/Compute
Grid : Message : 39.816828 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 39.816828 s : *********************************************************
Grid : Message : 56.382758 s : Deo mflop/s = 1.017e+07
Grid : Message : 56.382779 s : Deo mflop/s per rank 635627
Grid : Message : 56.382781 s : Deo mflop/s per node 2.54251e+06
Grid : Message : 56.382783 s : #### Dhop calls report
Grid : Message : 56.382784 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 56.382785 s : WilsonFermion5D TotalTime /Calls : 5519.98 us
Grid : Message : 56.382786 s : WilsonFermion5D CommTime /Calls : 4856.39 us
Grid : Message : 56.382787 s : WilsonFermion5D FaceTime /Calls : 303.043 us
Grid : Message : 56.382788 s : WilsonFermion5D ComputeTime1/Calls : 6.77807 us
Grid : Message : 56.382789 s : WilsonFermion5D ComputeTime2/Calls : 376.551 us
Grid : Message : 56.382810 s : Average mflops/s per call : 8.31124e+09
Grid : Message : 56.382811 s : Average mflops/s per call per rank : 5.19453e+08
Grid : Message : 56.382812 s : Average mflops/s per call per node : 2.07781e+09
Grid : Message : 56.382813 s : Average mflops/s per call (full) : 1.03399e+07
Grid : Message : 56.382814 s : Average mflops/s per call per rank (full): 646244
Grid : Message : 56.382815 s : Average mflops/s per call per node (full): 2.58498e+06
Grid : Message : 56.382816 s : WilsonFermion5D Stencil
Grid : Message : 56.382816 s : WilsonFermion5D StencilEven
Grid : Message : 56.382816 s : WilsonFermion5D StencilOdd
Grid : Message : 56.382816 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 56.382816 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 56.382816 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 56.414571 s : r_e6.02129
Grid : Message : 56.417837 s : r_o6.02097
Grid : Message : 56.420535 s : res12.0423
Grid : Message : 56.611957 s : norm diff 0
Grid : Message : 56.730597 s : norm diff even 0
Grid : Message : 56.752566 s : norm diff odd 0
156 systems/Perlmutter/dwf.64.64.64.64.4node.opt0 (Normal file)
@@ -0,0 +1,156 @@
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 2
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fd460000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34005057536 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.667601 s : Grid Layout
Grid : Message : 0.667602 s : Global lattice size : 64 64 64 64
Grid : Message : 0.667610 s : OpenMP threads : 32
Grid : Message : 0.667611 s : MPI tasks : 2 2 2 2
Grid : Message : 0.702872 s : Making s innermost grids
Grid : Message : 0.742911 s : Initialising 4d RNG
Grid : Message : 0.813463 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.813479 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.922630 s : Initialising 5d RNG
Grid : Message : 2.306290 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 2.306540 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 3.878430 s : Initialised RNGs
Grid : Message : 4.536926 s : Drawing gauge field
Grid : Message : 4.824391 s : Random gauge initialised
Grid : Message : 6.253195 s : Setting up Cshift based reference
Grid : Message : 7.326402 s : *****************************************************************
Grid : Message : 7.326411 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 7.326412 s : *****************************************************************
Grid : Message : 7.326412 s : *****************************************************************
Grid : Message : 7.326412 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 7.326412 s : * Vectorising space-time by 8
Grid : Message : 7.326413 s : * VComplexF size is 64 B
Grid : Message : 7.326414 s : * SINGLE precision
Grid : Message : 7.326414 s : * Using Overlapped Comms/Compute
Grid : Message : 7.326414 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 7.326414 s : *****************************************************************
Grid : Message : 8.283417 s : Called warmup
Grid : Message : 89.658859 s : Called Dw 3000 times in 8.13753e+07 us
Grid : Message : 89.658898 s : mflop/s = 1.3063e+07
Grid : Message : 89.658900 s : mflop/s per rank = 816437
Grid : Message : 89.658901 s : mflop/s per node = 3.26575e+06
Grid : Message : 89.658902 s : RF GiB/s (base 2) = 26543.7
Grid : Message : 89.658903 s : mem GiB/s (base 2) = 16589.8
Grid : Message : 89.662424 s : norm diff 1.03481e-13
Grid : Message : 89.700433 s : #### Dhop calls report
Grid : Message : 89.700452 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 89.700456 s : WilsonFermion5D TotalTime /Calls : 13588.2 us
Grid : Message : 89.700457 s : WilsonFermion5D CommTime /Calls : 12137.3 us
Grid : Message : 89.700458 s : WilsonFermion5D FaceTime /Calls : 548.408 us
Grid : Message : 89.700459 s : WilsonFermion5D ComputeTime1/Calls : 42.6163 us
Grid : Message : 89.700460 s : WilsonFermion5D ComputeTime2/Calls : 910.312 us
Grid : Message : 89.700477 s : Average mflops/s per call : 4.43502e+09
Grid : Message : 89.700493 s : Average mflops/s per call per rank : 2.77189e+08
Grid : Message : 89.700494 s : Average mflops/s per call per node : 1.10875e+09
Grid : Message : 89.700495 s : Average mflops/s per call (full) : 1.32753e+07
Grid : Message : 89.700496 s : Average mflops/s per call per rank (full): 829709
Grid : Message : 89.700497 s : Average mflops/s per call per node (full): 3.31884e+06
Grid : Message : 89.700498 s : WilsonFermion5D Stencil
Grid : Message : 89.700498 s : WilsonFermion5D StencilEven
Grid : Message : 89.700498 s : WilsonFermion5D StencilOdd
Grid : Message : 89.700499 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 89.700499 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 89.700499 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 101.462401 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 101.462412 s : Called DwDag
Grid : Message : 101.462413 s : norm dag result 12.0421
Grid : Message : 101.474097 s : norm dag ref 12.0421
Grid : Message : 101.489396 s : norm dag diff 7.63236e-14
Grid : Message : 101.529094 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 101.996820 s : src_e0.499997
Grid : Message : 102.626690 s : src_o0.500003
Grid : Message : 102.125734 s : *********************************************************
Grid : Message : 102.125736 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 102.125737 s : * Vectorising space-time by 8
Grid : Message : 102.125738 s : * SINGLE precision
Grid : Message : 102.125739 s : * Using Overlapped Comms/Compute
Grid : Message : 102.125739 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 102.125739 s : *********************************************************
Grid : Message : 143.296910 s : Deo mflop/s = 1.30119e+07
Grid : Message : 143.297140 s : Deo mflop/s per rank 813244
Grid : Message : 143.297160 s : Deo mflop/s per node 3.25297e+06
Grid : Message : 143.297180 s : #### Dhop calls report
Grid : Message : 143.297190 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 143.297200 s : WilsonFermion5D TotalTime /Calls : 13630 us
Grid : Message : 143.297210 s : WilsonFermion5D CommTime /Calls : 12124.9 us
Grid : Message : 143.297220 s : WilsonFermion5D FaceTime /Calls : 590.958 us
Grid : Message : 143.297230 s : WilsonFermion5D ComputeTime1/Calls : 43.2806 us
Grid : Message : 143.297240 s : WilsonFermion5D ComputeTime2/Calls : 921.187 us
Grid : Message : 143.297460 s : Average mflops/s per call : 4.24329e+09
Grid : Message : 143.297470 s : Average mflops/s per call per rank : 2.65206e+08
Grid : Message : 143.297480 s : Average mflops/s per call per node : 1.06082e+09
Grid : Message : 143.297490 s : Average mflops/s per call (full) : 1.32347e+07
Grid : Message : 143.297500 s : Average mflops/s per call per rank (full): 827169
Grid : Message : 143.297510 s : Average mflops/s per call per node (full): 3.30868e+06
Grid : Message : 143.297520 s : WilsonFermion5D Stencil
Grid : Message : 143.297520 s : WilsonFermion5D StencilEven
Grid : Message : 143.297520 s : WilsonFermion5D StencilOdd
Grid : Message : 143.297520 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 143.297520 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 143.297520 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 143.112368 s : r_e6.02111
Grid : Message : 143.119760 s : r_o6.02102
Grid : Message : 143.126239 s : res12.0421
Grid : Message : 143.720780 s : norm diff 0
Grid : Message : 144.885380 s : norm diff even 0
Grid : Message : 144.154396 s : norm diff odd 0
156 systems/Perlmutter/dwf.64.64.64.64.4node.opt1 (Normal file)
@@ -0,0 +1,156 @@
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB
AcceleratorCudaInit[0]: totalGlobalMem: 42506321920
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 2
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7f4b80000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 34005057536 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.648397 s : Grid Layout
Grid : Message : 0.648398 s : Global lattice size : 64 64 64 64
Grid : Message : 0.648401 s : OpenMP threads : 32
Grid : Message : 0.648402 s : MPI tasks : 2 2 2 2
Grid : Message : 0.663662 s : Making s innermost grids
Grid : Message : 0.682145 s : Initialising 4d RNG
Grid : Message : 0.754321 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.754332 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.863265 s : Initialising 5d RNG
Grid : Message : 1.967677 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.967691 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 2.921676 s : Initialised RNGs
Grid : Message : 4.382384 s : Drawing gauge field
Grid : Message : 4.672590 s : Random gauge initialised
Grid : Message : 6.102697 s : Setting up Cshift based reference
Grid : Message : 7.185897 s : *****************************************************************
Grid : Message : 7.185906 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 7.185907 s : *****************************************************************
Grid : Message : 7.185907 s : *****************************************************************
Grid : Message : 7.185907 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 7.185907 s : * Vectorising space-time by 8
Grid : Message : 7.185908 s : * VComplexF size is 64 B
Grid : Message : 7.185909 s : * SINGLE precision
Grid : Message : 7.185909 s : * Using Overlapped Comms/Compute
Grid : Message : 7.185909 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 7.185909 s : *****************************************************************
Grid : Message : 8.114241 s : Called warmup
Grid : Message : 83.988100 s : Called Dw 3000 times in 7.48954e+07 us
Grid : Message : 83.992400 s : mflop/s = 1.41932e+07
Grid : Message : 83.992600 s : mflop/s per rank = 887074
Grid : Message : 83.992700 s : mflop/s per node = 3.5483e+06
Grid : Message : 83.992800 s : RF GiB/s (base 2) = 28840.2
Grid : Message : 83.992900 s : mem GiB/s (base 2) = 18025.1
Grid : Message : 83.134870 s : norm diff 1.03481e-13
Grid : Message : 83.493960 s : #### Dhop calls report
Grid : Message : 83.494000 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 83.494030 s : WilsonFermion5D TotalTime /Calls : 12506 us
Grid : Message : 83.494040 s : WilsonFermion5D CommTime /Calls : 11071.5 us
Grid : Message : 83.494050 s : WilsonFermion5D FaceTime /Calls : 530.971 us
Grid : Message : 83.494060 s : WilsonFermion5D ComputeTime1/Calls : 23.6428 us
Grid : Message : 83.494070 s : WilsonFermion5D ComputeTime2/Calls : 911.864 us
Grid : Message : 83.494220 s : Average mflops/s per call : 7.6108e+09
Grid : Message : 83.494250 s : Average mflops/s per call per rank : 4.75675e+08
Grid : Message : 83.494260 s : Average mflops/s per call per node : 1.9027e+09
Grid : Message : 83.494270 s : Average mflops/s per call (full) : 1.44242e+07
Grid : Message : 83.494280 s : Average mflops/s per call per rank (full): 901513
Grid : Message : 83.494290 s : Average mflops/s per call per node (full): 3.60605e+06
Grid : Message : 83.494300 s : WilsonFermion5D Stencil
Grid : Message : 83.494300 s : WilsonFermion5D StencilEven
Grid : Message : 83.494300 s : WilsonFermion5D StencilOdd
Grid : Message : 83.494300 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 83.494300 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 83.494300 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 94.600488 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 94.600501 s : Called DwDag
Grid : Message : 94.600502 s : norm dag result 12.0421
Grid : Message : 94.613445 s : norm dag ref 12.0421
Grid : Message : 94.628514 s : norm dag diff 7.63236e-14
Grid : Message : 94.666370 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 95.136361 s : src_e0.499997
Grid : Message : 95.208108 s : src_o0.500003
Grid : Message : 95.271511 s : *********************************************************
Grid : Message : 95.271512 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 95.271513 s : * Vectorising space-time by 8
Grid : Message : 95.271514 s : * SINGLE precision
Grid : Message : 95.271514 s : * Using Overlapped Comms/Compute
Grid : Message : 95.271515 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 95.271515 s : *********************************************************
Grid : Message : 132.766274 s : Deo mflop/s = 1.41952e+07
Grid : Message : 132.766295 s : Deo mflop/s per rank 887201
Grid : Message : 132.766297 s : Deo mflop/s per node 3.5488e+06
Grid : Message : 132.766299 s : #### Dhop calls report
Grid : Message : 132.766300 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 132.766301 s : WilsonFermion5D TotalTime /Calls : 12493.9 us
Grid : Message : 132.766302 s : WilsonFermion5D CommTime /Calls : 10990.2 us
Grid : Message : 132.766303 s : WilsonFermion5D FaceTime /Calls : 604.889 us
Grid : Message : 132.766304 s : WilsonFermion5D ComputeTime1/Calls : 13.7158 us
Grid : Message : 132.766305 s : WilsonFermion5D ComputeTime2/Calls : 920.096 us
Grid : Message : 132.766326 s : Average mflops/s per call : 1.31121e+10
Grid : Message : 132.766328 s : Average mflops/s per call per rank : 8.19504e+08
Grid : Message : 132.766329 s : Average mflops/s per call per node : 3.27802e+09
Grid : Message : 132.766330 s : Average mflops/s per call (full) : 1.44381e+07
Grid : Message : 132.766331 s : Average mflops/s per call per rank (full): 902382
Grid : Message : 132.766332 s : Average mflops/s per call per node (full): 3.60953e+06
Grid : Message : 132.766333 s : WilsonFermion5D Stencil
Grid : Message : 132.766333 s : WilsonFermion5D StencilEven
Grid : Message : 132.766333 s : WilsonFermion5D StencilOdd
Grid : Message : 132.766333 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 132.766333 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 132.766333 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 132.847999 s : r_e6.02111
Grid : Message : 132.854237 s : r_o6.02102
Grid : Message : 132.860309 s : res12.0421
Grid : Message : 133.458462 s : norm diff 0
Grid : Message : 133.832713 s : norm diff even 0
Grid : Message : 133.909147 s : norm diff odd 0
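Comparing this .opt1 run (--shm-mpi 1) with the .opt0 run above on the same 64^4 volume, Dhop throughput improves from 1.3063e+07 to 1.41932e+07 mflop/s. The ratio, as a one-liner:

echo "scale=3; 1.41932/1.3063" | bc    # -> 1.086, i.e. roughly 9% faster with --shm-mpi 1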
24 systems/Perlmutter/dwf4.slurm (Normal file)
@@ -0,0 +1,24 @@
#!/bin/bash
#SBATCH -A mp13
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 0:20:00
#SBATCH -n 16
#SBATCH --ntasks-per-node=4
#SBATCH -c 32
#SBATCH --exclusive
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=map_gpu:0,1,2,3

export SLURM_CPU_BIND="cores"
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_SUPPORT_ENABLED=1
srun ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2 --accelerator-threads 8 > comms.4node

OPT="--comms-overlap --comms-concurrent --shm-mpi 0"
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0

OPT="--comms-overlap --comms-concurrent --shm-mpi 1"
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1
srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1
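This single script produces all five Perlmutter logs above: the comms benchmark plus the four dwf runs, with --shm-mpi 0 writing the .opt0 files and --shm-mpi 1 the .opt1 files. Submission follows the standard pattern (a sketch; it assumes the script is launched from the build directory containing ./benchmarks/):

sbatch systems/Perlmutter/dwf4.slurm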
4 systems/Perlmutter/sourceme.sh (Normal file)
@@ -0,0 +1,4 @@
export CRAY_ACCEL_TARGET=nvidia80

module load PrgEnv-gnu cpe-cuda cuda
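CRAY_ACCEL_TARGET=nvidia80 points the Cray compiler wrappers at the A100 (compute capability 8.0), consistent with the -gencode arch=compute_80 flag in config-command above. As on the other systems, source it before building or submitting (path assumed relative to the source root):

source systems/Perlmutter/sourceme.sh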
26 systems/Spock/comms.slurm (Normal file)
@@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J comms
#SBATCH -o comms.%J
#SBATCH -e comms.%J
#SBATCH -N 1
#SBATCH -n 2

DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_SMP_SINGLE_COPY_MODE=NONE
export OMP_NUM_THREADS=8

AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 64.64.32.32 --mpi 2.1.1.1 "
srun -n2 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_comms_host_device $PARAMS
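The commented-out exports make it easy to A/B test Cray MPICH's intra-node single-copy path; the Spock scripts in this commit variously pick NONE or CMA. Switching is just a matter of which line is active:

# intra-node single-copy options (pick one)
export MPICH_SMP_SINGLE_COPY_MODE=NONE    # used in this script
#export MPICH_SMP_SINGLE_COPY_MODE=CMA    # used in dwf.slurm below
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM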
12 systems/Spock/config-command (Normal file)
@@ -0,0 +1,12 @@
../../configure --enable-comms=mpi-auto \
--enable-unified=no \
--enable-shm=nvlink \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--disable-fermion-reps \
--disable-gparity \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I/opt/rocm-4.3.0/include/ -std=c++14 -I${MPICH_DIR}/include " \
--prefix=/ccs/home/chulwoo/Grid \
LDFLAGS=" -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa "
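The -lmpi_gtl_hsa link pulls in Cray MPICH's GPU Transport Layer for AMD (HSA) devices, which the MPICH_GPU_SUPPORT_ENABLED=1 exports in the job scripts rely on for GPU-aware MPI. A quick way to confirm the binary picked it up (binary path assumed):

ldd ./benchmarks/Benchmark_dwf_fp32 | grep gtl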
26 systems/Spock/dwf.slurm (Normal file)
@@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 1

DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#export MPICH_SMP_SINGLE_COPY_MODE=NONE
export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8

AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 32.32.32.32 --mpi 1.1.1.1 --comms-overlap"
srun -n1 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
26 systems/Spock/dwf4.slurm (Normal file)
@@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 1
#SBATCH -n 4

DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8

AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 32.32.64.64 --mpi 1.1.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun -n4 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
26 systems/Spock/dwf8.slurm (Normal file)
@@ -0,0 +1,26 @@
#!/bin/bash
# Begin LSF Directives
#SBATCH -A LGT104
#SBATCH -t 01:00:00
##SBATCH -U openmpThu
#SBATCH -p ecp
#SBATCH -J DWF
#SBATCH -o DWF.%J
#SBATCH -e DWF.%J
#SBATCH -N 2
#SBATCH -n 8

DIR=.
module list
export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export MPICH_SMP_SINGLE_COPY_MODE=NONE
#export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8

AT=8
echo MPICH_SMP_SINGLE_COPY_MODE $MPICH_SMP_SINGLE_COPY_MODE
PARAMS=" --accelerator-threads ${AT} --grid 32.64.64.64 --mpi 1.2.2.2 --comms-overlap --shm 2048 --shm-mpi 0"
srun -n8 --label -c$OMP_NUM_THREADS --gpus-per-task=1 ./mpiwrapper.sh ./benchmarks/Benchmark_dwf_fp32 $PARAMS
12 systems/Spock/mpiwrapper.sh (Executable file)
@@ -0,0 +1,12 @@
#!/bin/bash

lrank=$SLURM_LOCALID

export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID

echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES binding=$BINDING"

$*
5 systems/Spock/sourceme.sh (Normal file)
@@ -0,0 +1,5 @@
module load PrgEnv-gnu
module load rocm/4.3.0
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx908
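Note the target difference from Crusher: craype-accel-amd-gfx908 matches Spock's MI100s, while the Crusher sourceme.sh above loads gfx90a for MI250X, with the ROCm versions (4.3.0 vs 4.5.0) differing accordingly. Usage is the same:

source systems/Spock/sourceme.sh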
179 systems/Summit/comms.4node (Normal file)
@@ -0,0 +1,179 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
AcceleratorCudaInit[0]: totalGlobalMem: 16911433728
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 4
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: rank 0 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 0 device 0 bus id: 0004:04:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 24
SharedMemoryMpi: Node communicator of size 6
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x200060000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 1073741824 byte stencil comms buffers
AcceleratorCudaInit: rank 1 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 1 device 1 bus id: 0004:05:00.0
AcceleratorCudaInit: rank 2 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 2 device 2 bus id: 0004:06:00.0
AcceleratorCudaInit: rank 5 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 5 device 5 bus id: 0035:05:00.0
AcceleratorCudaInit: rank 4 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 4 device 4 bus id: 0035:04:00.0
AcceleratorCudaInit: rank 3 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 3 device 3 bus id: 0035:03:00.0
Grid : Message : MemoryManager Cache 13529146982 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 2.137929 s : Grid is setup to use 6 threads
Grid : Message : 2.137941 s : Number of iterations to average: 250
Grid : Message : 2.137950 s : ====================================================================================================
Grid : Message : 2.137958 s : = Benchmarking sequential halo exchange from host memory
Grid : Message : 2.137966 s : ====================================================================================================
Grid : Message : 2.137974 s : L Ls bytes MB/s uni MB/s bidi
AcceleratorCudaInit: rank 22 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 10 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 15 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 21 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 20 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 7 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 9 setting device to node rank 3
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 11 setting device to node rank 5
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 8 setting device to node rank 2
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 6 setting device to node rank 0
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 19 setting device to node rank 1
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 23 setting device to node rank 5
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 18 setting device to node rank 0
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 12 setting device to node rank 0
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 16 setting device to node rank 4
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 13 setting device to node rank 1
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 14 setting device to node rank 2
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
AcceleratorCudaInit: rank 17 setting device to node rank 5
|
||||
AcceleratorCudaInit: Configure options --enable-setdevice=yes
|
||||
Grid : Message : 2.604949 s : 8 8 393216 89973.9 179947.8
|
||||
Grid : Message : 2.668249 s : 8 8 393216 18650.3 37300.5
|
||||
Grid : Message : 2.732288 s : 8 8 393216 18428.5 36857.1
|
||||
Grid : Message : 2.753565 s : 8 8 393216 55497.2 110994.4
|
||||
Grid : Message : 2.808960 s : 12 8 1327104 100181.5 200363.0
|
||||
Grid : Message : 3.226900 s : 12 8 1327104 20600.5 41201.0
|
||||
Grid : Message : 3.167459 s : 12 8 1327104 24104.6 48209.2
|
||||
Grid : Message : 3.227660 s : 12 8 1327104 66156.7 132313.5
|
||||
Grid : Message : 3.413570 s : 16 8 3145728 56174.4 112348.8
|
||||
Grid : Message : 3.802697 s : 16 8 3145728 24255.9 48511.7
|
||||
Grid : Message : 4.190498 s : 16 8 3145728 24336.7 48673.4
|
||||
Grid : Message : 4.385171 s : 16 8 3145728 48484.1 96968.2
|
||||
Grid : Message : 4.805284 s : 20 8 6144000 46380.5 92761.1
|
||||
Grid : Message : 5.562975 s : 20 8 6144000 24328.5 48656.9
|
||||
Grid : Message : 6.322562 s : 20 8 6144000 24266.7 48533.4
|
||||
Grid : Message : 6.773598 s : 20 8 6144000 40868.5 81736.9
|
||||
Grid : Message : 7.600999 s : 24 8 10616832 40198.3 80396.6
|
||||
Grid : Message : 8.912917 s : 24 8 10616832 24279.5 48559.1
|
||||
Grid : Message : 10.220961 s : 24 8 10616832 24350.2 48700.4
|
||||
Grid : Message : 11.728250 s : 24 8 10616832 37390.9 74781.8
|
||||
Grid : Message : 12.497258 s : 28 8 16859136 36792.2 73584.5
|
||||
Grid : Message : 14.585387 s : 28 8 16859136 24222.2 48444.3
|
||||
Grid : Message : 16.664783 s : 28 8 16859136 24323.4 48646.8
|
||||
Grid : Message : 17.955238 s : 28 8 16859136 39194.7 78389.4
|
||||
Grid : Message : 20.136479 s : 32 8 25165824 35718.3 71436.5
|
||||
Grid : Message : 23.241958 s : 32 8 25165824 24311.4 48622.9
|
||||
Grid : Message : 26.344810 s : 32 8 25165824 24331.9 48663.7
|
||||
Grid : Message : 28.384420 s : 32 8 25165824 37016.3 74032.7
|
||||
Grid : Message : 28.388879 s : ====================================================================================================
|
||||
Grid : Message : 28.388894 s : = Benchmarking sequential halo exchange from GPU memory
|
||||
Grid : Message : 28.388909 s : ====================================================================================================
|
||||
Grid : Message : 28.388924 s : L Ls bytes MB/s uni MB/s bidi
|
||||
Grid : Message : 28.553993 s : 8 8 393216 8272.4 16544.7
|
||||
Grid : Message : 28.679592 s : 8 8 393216 9395.4 18790.8
|
||||
Grid : Message : 28.811112 s : 8 8 393216 8971.0 17942.0
|
||||
Grid : Message : 28.843770 s : 8 8 393216 36145.6 72291.2
|
||||
Grid : Message : 28.981754 s : 12 8 1327104 49591.6 99183.2
|
||||
Grid : Message : 29.299764 s : 12 8 1327104 12520.8 25041.7
|
||||
Grid : Message : 29.620288 s : 12 8 1327104 12422.2 24844.4
|
||||
Grid : Message : 29.657645 s : 12 8 1327104 106637.5 213275.1
|
||||
Grid : Message : 29.952933 s : 16 8 3145728 43939.2 87878.5
|
||||
Grid : Message : 30.585411 s : 16 8 3145728 14922.1 29844.2
|
||||
Grid : Message : 31.219781 s : 16 8 3145728 14877.2 29754.4
|
||||
Grid : Message : 31.285017 s : 16 8 3145728 144724.3 289448.7
|
||||
Grid : Message : 31.706443 s : 20 8 6144000 54676.2 109352.4
|
||||
Grid : Message : 32.739205 s : 20 8 6144000 17848.0 35696.1
|
||||
Grid : Message : 33.771852 s : 20 8 6144000 17849.9 35699.7
|
||||
Grid : Message : 33.871981 s : 20 8 6144000 184141.4 368282.8
|
||||
Grid : Message : 34.536808 s : 24 8 10616832 55784.3 111568.6
|
||||
Grid : Message : 36.275648 s : 24 8 10616832 18317.6 36635.3
|
||||
Grid : Message : 37.997181 s : 24 8 10616832 18501.7 37003.4
|
||||
Grid : Message : 38.140442 s : 24 8 10616832 222383.9 444767.9
|
||||
Grid : Message : 39.177222 s : 28 8 16859136 56609.7 113219.4
|
||||
Grid : Message : 41.874755 s : 28 8 16859136 18749.9 37499.8
|
||||
Grid : Message : 44.529381 s : 28 8 16859136 19052.9 38105.8
|
||||
Grid : Message : 44.742192 s : 28 8 16859136 237717.1 475434.2
|
||||
Grid : Message : 46.184000 s : 32 8 25165824 57091.2 114182.4
|
||||
Grid : Message : 50.734740 s : 32 8 25165824 19411.0 38821.9
|
||||
Grid : Message : 53.931228 s : 32 8 25165824 19570.6 39141.2
|
||||
Grid : Message : 54.238467 s : 32 8 25165824 245765.6 491531.2
|
||||
Grid : Message : 54.268664 s : ====================================================================================================
|
||||
Grid : Message : 54.268680 s : = All done; Bye Bye
|
||||
Grid : Message : 54.268691 s : ====================================================================================================
|
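One reading aid for the tables above (not part of the log): the bidi column is simply twice the uni-directional figure, and at the largest volume the GPU-memory exchange sustains roughly 6-7x the host-memory path (245765.6 vs 37016.3 MB/s uni):

# sanity check on the columns, using the final L=32 GPU-memory row
echo "2*245765.6" | bc    # 491531.2, i.e. bidi = 2 x uni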
14
systems/Summit/config-command
Normal file
@ -0,0 +1,14 @@
../../configure --enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=32 \
--enable-unified=no \
--enable-shm=nvlink \
--disable-gparity \
--enable-setdevice \
--disable-fermion-reps \
--enable-accelerator=cuda \
--prefix /ccs/home/paboyle/prefix \
CXX=nvcc \
LDFLAGS=-L/ccs/home/paboyle/prefix/lib/ \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_70,code=sm_70 -I/ccs/home/paboyle/prefix/include/ -std=c++14"
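The leading ../../configure implies the command is issued from a build directory two levels below the source root; an assumed invocation pattern (directory names illustrative, not fixed by the commit):

mkdir -p build/Summit && cd build/Summit
# paste the ../../configure command above, then
make -j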
206
systems/Summit/dwf.24.4node
Normal file
@ -0,0 +1,206 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
AcceleratorCudaInit[0]: totalGlobalMem: 16911433728
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 4
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: rank 0 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 0 device 0 bus id: 0004:04:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 24
SharedMemoryMpi: Node communicator of size 6
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers
AcceleratorCudaInit: rank 3 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 3 device 3 bus id: 0035:03:00.0
AcceleratorCudaInit: rank 5 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 5 device 5 bus id: 0035:05:00.0
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
AcceleratorCudaInit: rank 4 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 4 device 4 bus id: 0035:04:00.0
AcceleratorCudaInit: rank 1 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 1 device 1 bus id: 0004:05:00.0
AcceleratorCudaInit: rank 2 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 2 device 2 bus id: 0004:06:00.0
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 8388608000 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.731905 s : Grid Layout
Grid : Message : 1.731915 s : Global lattice size : 48 48 48 72
Grid : Message : 1.731928 s : OpenMP threads : 6
Grid : Message : 1.731938 s : MPI tasks : 2 2 2 3
AcceleratorCudaInit: rank 9 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 23 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 22 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 21 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 18 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 6 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 7 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 10 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 8 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 11 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 20 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 19 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 13 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 12 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 14 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 16 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 15 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 17 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
Grid : Message : 2.683494 s : Making s innermost grids
Grid : Message : 2.780034 s : Initialising 4d RNG
Grid : Message : 2.833099 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 2.833121 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.916841 s : Initialising 5d RNG
Grid : Message : 3.762880 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 3.762902 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 5.264345 s : Initialised RNGs
Grid : Message : 6.489904 s : Drawing gauge field
Grid : Message : 6.729262 s : Random gauge initialised
Grid : Message : 7.781273 s : Setting up Cshift based reference
Grid : Message : 8.725313 s : *****************************************************************
Grid : Message : 8.725332 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 8.725342 s : *****************************************************************
Grid : Message : 8.725352 s : *****************************************************************
Grid : Message : 8.725362 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 8.725372 s : * Vectorising space-time by 4
Grid : Message : 8.725383 s : * VComplexF size is 32 B
Grid : Message : 8.725395 s : * SINGLE precision
Grid : Message : 8.725405 s : * Using Overlapped Comms/Compute
Grid : Message : 8.725415 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 8.725425 s : *****************************************************************
Grid : Message : 9.465229 s : Called warmup
Grid : Message : 58.646066 s : Called Dw 3000 times in 4.91764e+07 us
Grid : Message : 58.646121 s : mflop/s = 1.02592e+07
Grid : Message : 58.646134 s : mflop/s per rank = 427468
Grid : Message : 58.646145 s : mflop/s per node = 2.56481e+06
Grid : Message : 58.646156 s : RF GiB/s (base 2) = 20846.5
Grid : Message : 58.646166 s : mem GiB/s (base 2) = 13029.1
Grid : Message : 58.648008 s : norm diff 1.04778e-13
Grid : Message : 58.734885 s : #### Dhop calls report
Grid : Message : 58.734897 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 58.734909 s : WilsonFermion5D TotalTime /Calls : 8217.71 us
Grid : Message : 58.734922 s : WilsonFermion5D CommTime /Calls : 7109.5 us
Grid : Message : 58.734933 s : WilsonFermion5D FaceTime /Calls : 446.623 us
Grid : Message : 58.734943 s : WilsonFermion5D ComputeTime1/Calls : 18.0558 us
Grid : Message : 58.734953 s : WilsonFermion5D ComputeTime2/Calls : 731.097 us
Grid : Message : 58.734979 s : Average mflops/s per call : 4.8157e+09
Grid : Message : 58.734989 s : Average mflops/s per call per rank : 2.00654e+08
Grid : Message : 58.734999 s : Average mflops/s per call per node : 1.20393e+09
Grid : Message : 58.735008 s : Average mflops/s per call (full) : 1.04183e+07
Grid : Message : 58.735017 s : Average mflops/s per call per rank (full): 434094
Grid : Message : 58.735026 s : Average mflops/s per call per node (full): 2.60456e+06
Grid : Message : 58.735035 s : WilsonFermion5D Stencil
Grid : Message : 58.735043 s : WilsonFermion5D StencilEven
Grid : Message : 58.735051 s : WilsonFermion5D StencilOdd
Grid : Message : 58.735059 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 58.735067 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 58.735075 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 64.934380 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 64.934740 s : Called DwDag
Grid : Message : 64.934870 s : norm dag result 12.0422
Grid : Message : 64.120756 s : norm dag ref 12.0422
Grid : Message : 64.149389 s : norm dag diff 7.6644e-14
Grid : Message : 64.317786 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 64.465331 s : src_e0.499995
Grid : Message : 64.524653 s : src_o0.500005
Grid : Message : 64.558706 s : *********************************************************
Grid : Message : 64.558717 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 64.558727 s : * Vectorising space-time by 4
Grid : Message : 64.558737 s : * SINGLE precision
Grid : Message : 64.558745 s : * Using Overlapped Comms/Compute
Grid : Message : 64.558753 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 64.558761 s : *********************************************************
Grid : Message : 92.702145 s : Deo mflop/s = 8.97692e+06
Grid : Message : 92.702185 s : Deo mflop/s per rank 374038
Grid : Message : 92.702198 s : Deo mflop/s per node 2.24423e+06
Grid : Message : 92.702209 s : #### Dhop calls report
Grid : Message : 92.702223 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 92.702240 s : WilsonFermion5D TotalTime /Calls : 9377.88 us
Grid : Message : 92.702257 s : WilsonFermion5D CommTime /Calls : 8221.84 us
Grid : Message : 92.702277 s : WilsonFermion5D FaceTime /Calls : 543.548 us
Grid : Message : 92.702301 s : WilsonFermion5D ComputeTime1/Calls : 20.936 us
Grid : Message : 92.702322 s : WilsonFermion5D ComputeTime2/Calls : 732.33 us
Grid : Message : 92.702376 s : Average mflops/s per call : 4.13001e+09
Grid : Message : 92.702387 s : Average mflops/s per call per rank : 1.72084e+08
Grid : Message : 92.702397 s : Average mflops/s per call per node : 1.0325e+09
Grid : Message : 92.702407 s : Average mflops/s per call (full) : 9.12937e+06
Grid : Message : 92.702416 s : Average mflops/s per call per rank (full): 380391
Grid : Message : 92.702426 s : Average mflops/s per call per node (full): 2.28234e+06
Grid : Message : 92.702435 s : WilsonFermion5D Stencil
Grid : Message : 92.702443 s : WilsonFermion5D StencilEven
Grid : Message : 92.702451 s : WilsonFermion5D StencilOdd
Grid : Message : 92.702459 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 92.702467 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 92.702475 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 92.772983 s : r_e6.02121
Grid : Message : 92.786384 s : r_o6.02102
Grid : Message : 92.799622 s : res12.0422
Grid : Message : 93.860500 s : norm diff 0
Grid : Message : 93.162026 s : norm diff even 0
Grid : Message : 93.197529 s : norm diff odd 0
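Worth noting in the report above: the Dhop is communication-bound at this 24.24.24.24 local volume, with CommTime at 7109.5 us of an 8217.71 us total per call (a back-of-envelope check, not part of the log):

echo "scale=4; 7109.5/8217.71*100" | bc   # ~86.51, i.e. ~86.5% of Dhop time in comms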
206
systems/Summit/dwf.32.4node
Normal file
@ -0,0 +1,206 @@
OPENMPI detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: Tesla V100-SXM2-16GB
AcceleratorCudaInit[0]: totalGlobalMem: 16911433728
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 4
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: rank 0 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 0 device 0 bus id: 0004:04:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 24
SharedMemoryMpi: Node communicator of size 6
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x200080000000 for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |


Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
AcceleratorCudaInit: rank 2 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 2 device 2 bus id: 0004:06:00.0
AcceleratorCudaInit: rank 1 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 1 device 1 bus id: 0004:05:00.0
AcceleratorCudaInit: rank 4 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 4 device 4 bus id: 0035:04:00.0
AcceleratorCudaInit: rank 3 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 3 device 3 bus id: 0035:03:00.0
AcceleratorCudaInit: rank 5 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
local rank 5 device 5 bus id: 0035:05:00.0
GNU General Public License for more details.
Current Grid git commit hash=7cb1ff7395a5833ded6526c43891bd07a0436290: (HEAD -> develop, origin/develop, origin/HEAD) clean

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 8388608000 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 8 LARGE 2
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 1.544984 s : Grid Layout
Grid : Message : 1.544992 s : Global lattice size : 64 64 64 96
Grid : Message : 1.545003 s : OpenMP threads : 6
Grid : Message : 1.545011 s : MPI tasks : 2 2 2 3
AcceleratorCudaInit: rank 8 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 6 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 11 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 16 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 17 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 13 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 12 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 21 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 23 setting device to node rank 5
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 22 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 19 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 18 setting device to node rank 0
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 7 setting device to node rank 1
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 10 setting device to node rank 4
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 9 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 14 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 15 setting device to node rank 3
AcceleratorCudaInit: Configure options --enable-setdevice=yes
AcceleratorCudaInit: rank 20 setting device to node rank 2
AcceleratorCudaInit: Configure options --enable-setdevice=yes
Grid : Message : 2.994920 s : Making s innermost grids
Grid : Message : 2.232502 s : Initialising 4d RNG
Grid : Message : 2.397047 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 2.397069 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 2.653140 s : Initialising 5d RNG
Grid : Message : 5.285347 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 5.285369 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 9.994738 s : Initialised RNGs
Grid : Message : 13.153426 s : Drawing gauge field
Grid : Message : 13.825697 s : Random gauge initialised
Grid : Message : 18.537657 s : Setting up Cshift based reference
Grid : Message : 22.296755 s : *****************************************************************
Grid : Message : 22.296781 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 22.296791 s : *****************************************************************
Grid : Message : 22.296800 s : *****************************************************************
Grid : Message : 22.296809 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 22.296818 s : * Vectorising space-time by 4
Grid : Message : 22.296828 s : * VComplexF size is 32 B
Grid : Message : 22.296838 s : * SINGLE precision
Grid : Message : 22.296847 s : * Using Overlapped Comms/Compute
Grid : Message : 22.296855 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 22.296863 s : *****************************************************************
Grid : Message : 24.746452 s : Called warmup
Grid : Message : 137.525756 s : Called Dw 3000 times in 1.12779e+08 us
Grid : Message : 137.525818 s : mflop/s = 1.41383e+07
Grid : Message : 137.525831 s : mflop/s per rank = 589097
Grid : Message : 137.525843 s : mflop/s per node = 3.53458e+06
Grid : Message : 137.525854 s : RF GiB/s (base 2) = 28728.7
Grid : Message : 137.525864 s : mem GiB/s (base 2) = 17955.5
Grid : Message : 137.693645 s : norm diff 1.04885e-13
Grid : Message : 137.965585 s : #### Dhop calls report
Grid : Message : 137.965598 s : WilsonFermion5D Number of DhopEO Calls : 6002
Grid : Message : 137.965612 s : WilsonFermion5D TotalTime /Calls : 18899.7 us
Grid : Message : 137.965624 s : WilsonFermion5D CommTime /Calls : 16041.4 us
Grid : Message : 137.965634 s : WilsonFermion5D FaceTime /Calls : 859.705 us
Grid : Message : 137.965644 s : WilsonFermion5D ComputeTime1/Calls : 70.5881 us
Grid : Message : 137.965654 s : WilsonFermion5D ComputeTime2/Calls : 2094.8 us
Grid : Message : 137.965682 s : Average mflops/s per call : 3.87638e+09
Grid : Message : 137.965692 s : Average mflops/s per call per rank : 1.61516e+08
Grid : Message : 137.965702 s : Average mflops/s per call per node : 9.69095e+08
Grid : Message : 137.965712 s : Average mflops/s per call (full) : 1.43168e+07
Grid : Message : 137.965721 s : Average mflops/s per call per rank (full): 596533
Grid : Message : 137.965730 s : Average mflops/s per call per node (full): 3.5792e+06
Grid : Message : 137.965740 s : WilsonFermion5D Stencil
Grid : Message : 137.965748 s : WilsonFermion5D StencilEven
Grid : Message : 137.965756 s : WilsonFermion5D StencilOdd
Grid : Message : 137.965764 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 137.965772 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 137.965780 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 156.554605 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 156.554632 s : Called DwDag
Grid : Message : 156.554642 s : norm dag result 12.0421
Grid : Message : 156.639265 s : norm dag ref 12.0421
Grid : Message : 156.888281 s : norm dag diff 7.62057e-14
Grid : Message : 157.609797 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 158.208630 s : src_e0.499996
Grid : Message : 158.162447 s : src_o0.500004
Grid : Message : 158.267780 s : *********************************************************
Grid : Message : 158.267791 s : * Benchmarking DomainWallFermionF::DhopEO
Grid : Message : 158.267801 s : * Vectorising space-time by 4
Grid : Message : 158.267811 s : * SINGLE precision
Grid : Message : 158.267820 s : * Using Overlapped Comms/Compute
Grid : Message : 158.267828 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 158.267836 s : *********************************************************
Grid : Message : 216.487829 s : Deo mflop/s = 1.37283e+07
Grid : Message : 216.487869 s : Deo mflop/s per rank 572011
Grid : Message : 216.487881 s : Deo mflop/s per node 3.43206e+06
Grid : Message : 216.487893 s : #### Dhop calls report
Grid : Message : 216.487903 s : WilsonFermion5D Number of DhopEO Calls : 3001
Grid : Message : 216.487913 s : WilsonFermion5D TotalTime /Calls : 19399.6 us
Grid : Message : 216.487923 s : WilsonFermion5D CommTime /Calls : 16475.4 us
Grid : Message : 216.487933 s : WilsonFermion5D FaceTime /Calls : 972.393 us
Grid : Message : 216.487943 s : WilsonFermion5D ComputeTime1/Calls : 49.8474 us
Grid : Message : 216.487953 s : WilsonFermion5D ComputeTime2/Calls : 2089.93 us
Grid : Message : 216.488001 s : Average mflops/s per call : 5.39682e+09
Grid : Message : 216.488011 s : Average mflops/s per call per rank : 2.24867e+08
Grid : Message : 216.488020 s : Average mflops/s per call per node : 1.3492e+09
Grid : Message : 216.488030 s : Average mflops/s per call (full) : 1.39479e+07
Grid : Message : 216.488039 s : Average mflops/s per call per rank (full): 581162
Grid : Message : 216.488048 s : Average mflops/s per call per node (full): 3.48697e+06
Grid : Message : 216.488057 s : WilsonFermion5D Stencil
Grid : Message : 216.488065 s : WilsonFermion5D StencilEven
Grid : Message : 216.488073 s : WilsonFermion5D StencilOdd
Grid : Message : 216.488081 s : WilsonFermion5D Stencil Reporti()
Grid : Message : 216.488089 s : WilsonFermion5D StencilEven Reporti()
Grid : Message : 216.488097 s : WilsonFermion5D StencilOdd Reporti()
Grid : Message : 217.384495 s : r_e6.02113
Grid : Message : 217.426121 s : r_o6.02096
Grid : Message : 217.472636 s : res12.0421
Grid : Message : 218.200068 s : norm diff 0
Grid : Message : 218.645673 s : norm diff even 0
Grid : Message : 218.816561 s : norm diff odd 0
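Comparing with the 24^4-per-rank run above: at the 32.32.32.32 local volume (64.64.64.96 over the same 2.2.2.3 decomposition) the sustained rate rises from 1.02592e+07 to 1.41383e+07 mflop/s, consistent with the better surface-to-volume ratio at the larger local lattice (rough check, not part of the log):

echo "scale=3; 1.41383/1.02592" | bc   # ~1.378, i.e. ~38% more throughput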
25
systems/Summit/dwf16.lsf
Normal file
@ -0,0 +1,25 @@
#!/bin/bash
#BSUB -P LGT104
#BSUB -W 2:00
#BSUB -nnodes 16
#BSUB -J DWF

export OMP_NUM_THREADS=6
export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "

APP="./benchmarks/Benchmark_comms_host_device --mpi 4.4.4.3 "
jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.16node.log

APP="./benchmarks/Benchmark_dwf_fp32 --grid 96.96.96.72 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.24.log

APP="./benchmarks/Benchmark_dwf_fp32 --grid 128.128.128.96 --mpi 4.4.4.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
jsrun --nrs 16 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.16node.32.log
25
systems/Summit/dwf4.lsf
Normal file
@ -0,0 +1,25 @@
#!/bin/bash
#BSUB -P LGT104
#BSUB -W 2:00
#BSUB -nnodes 4
#BSUB -J DWF

export OMP_NUM_THREADS=6
export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_ENABLE_STRIPING=1
export OPT="--comms-concurrent --comms-overlap "
#export GRID_ALLOC_NCACHE_LARGE=1
export APP="./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.3 "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > comms.4node

APP="./benchmarks/Benchmark_dwf_fp32 --grid 48.48.48.72 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.24.4node

APP="./benchmarks/Benchmark_dwf_fp32 --grid 64.64.64.96 --mpi 2.2.2.3 --shm 2048 --shm-force-mpi 1 --device-mem 8000 $OPT "
jsrun --nrs 4 -a6 -g6 -c42 -dpacked -b packed:7 --latency_priority gpu-cpu --smpiargs=-gpu $APP > dwf.32.4node
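A consistency check on the 4-node script (not part of the commit): jsrun's --nrs 4 -a6 -g6 requests four resource sets of six tasks and six GPUs each, 24 ranks in total, matching the 2.2.2.3 MPI decomposition and producing the dwf.24.4node and dwf.32.4node logs shown above:

echo $(( 4 * 6 )) $(( 2 * 2 * 2 * 3 ))   # both print 24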
8
systems/Summit/sourceme-cuda10.sh
Normal file
@ -0,0 +1,8 @@
export UCX_GDR_COPY_RCACHE=no
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_SCHEME=put_zcopy
module load gcc/7.5.0
module load cuda/10.2.89
#cuda/11.4.0
export LD_LIBRARY_PATH=/ccs/home/paboyle/prefix/lib/:$LD_LIBRARY_PATH
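The UCX settings above (disabling the GDR-copy registration cache and the memtype cache, forcing the put_zcopy rendezvous scheme) are common workarounds for CUDA-aware UCX of that era; the assumed usage, mirroring the Spock script, is to source the file before building or running:

source systems/Summit/sourceme-cuda10.sh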
Some files were not shown because too many files have changed in this diff.