mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-17 23:37:06 +01:00
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
This commit is contained in:
@ -358,7 +358,7 @@ public:
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||
auto& geom_v = geom;
|
||||
int npoint = geom.npoint;
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
@ -380,7 +380,7 @@ public:
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
for(int point=0;point<geom_v.npoint;point++){
|
||||
for(int point=0;point<npoint;point++){
|
||||
|
||||
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||
|
||||
@ -424,7 +424,7 @@ public:
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||
auto& geom_v = geom;
|
||||
int npoint = geom.npoint;
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
@ -454,7 +454,7 @@ public:
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
for(int p=0;p<geom_v.npoint;p++){
|
||||
for(int p=0;p<npoint;p++){
|
||||
int point = points_p[p];
|
||||
|
||||
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||
|
@ -508,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
||||
virtual void MpcDag (const Field &in, Field &out){
|
||||
Mpc(in,out);
|
||||
}
|
||||
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
|
||||
virtual void MpcDagMpc(const Field &in, Field &out) {
|
||||
assert(0);// Never need with staggered
|
||||
}
|
||||
};
|
||||
@ -586,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
|
||||
template<typename Field>
|
||||
class PlainHermOp : public LinearFunction<Field> {
|
||||
public:
|
||||
using LinearFunction<Field>::operator();
|
||||
LinearOperatorBase<Field> &_Linop;
|
||||
|
||||
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
|
||||
@ -599,6 +600,7 @@ public:
|
||||
template<typename Field>
|
||||
class FunctionHermOp : public LinearFunction<Field> {
|
||||
public:
|
||||
using LinearFunction<Field>::operator();
|
||||
OperatorFunction<Field> & _poly;
|
||||
LinearOperatorBase<Field> &_Linop;
|
||||
|
||||
|
@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Field> class Preconditioner : public LinearFunction<Field> {
|
||||
template<class Field> using Preconditioner = LinearFunction<Field> ;
|
||||
|
||||
/*
|
||||
template<class Field> class Preconditioner : public LinearFunction<Field> {
|
||||
using LinearFunction<Field>::operator();
|
||||
virtual void operator()(const Field &src, Field & psi)=0;
|
||||
};
|
||||
*/
|
||||
|
||||
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
|
||||
public:
|
||||
void operator()(const Field &src, Field & psi){
|
||||
using Preconditioner<Field>::operator();
|
||||
virtual void operator()(const Field &src, Field & psi){
|
||||
psi = src;
|
||||
}
|
||||
TrivialPrecon(void){};
|
||||
|
@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
|
||||
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
|
||||
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
|
||||
{
|
||||
public:
|
||||
public:
|
||||
using LinearFunction<FieldD>::operator();
|
||||
RealD Tolerance;
|
||||
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
|
||||
Integer MaxInnerIterations;
|
||||
|
@ -67,6 +67,7 @@ public:
|
||||
template<class Fobj,class CComplex,int nbasis>
|
||||
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
|
||||
public:
|
||||
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
|
||||
typedef iVector<CComplex,nbasis > CoarseSiteVector;
|
||||
typedef Lattice<CoarseSiteVector> CoarseField;
|
||||
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
|
||||
@ -97,6 +98,7 @@ public:
|
||||
template<class Fobj,class CComplex,int nbasis>
|
||||
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
|
||||
public:
|
||||
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
|
||||
typedef iVector<CComplex,nbasis > CoarseSiteVector;
|
||||
typedef Lattice<CoarseSiteVector> CoarseField;
|
||||
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
|
||||
|
@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
template<class Field>
|
||||
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
|
||||
public:
|
||||
|
||||
using LinearFunction<Field>::operator();
|
||||
RealD Tolerance;
|
||||
Integer MaxIterations;
|
||||
int verbose;
|
||||
|
@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
template<class Field>
|
||||
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
|
||||
public:
|
||||
|
||||
using LinearFunction<Field>::operator();
|
||||
RealD Tolerance;
|
||||
Integer MaxIterations;
|
||||
int verbose;
|
||||
@ -119,7 +119,8 @@ public:
|
||||
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
|
||||
|
||||
RealD cp;
|
||||
ComplexD a, b, zAz;
|
||||
ComplexD a, b;
|
||||
// ComplexD zAz;
|
||||
RealD zAAz;
|
||||
ComplexD rq;
|
||||
|
||||
@ -146,7 +147,7 @@ public:
|
||||
//////////////////////////////////
|
||||
MatTimer.Start();
|
||||
Linop.Op(psi,Az);
|
||||
zAz = innerProduct(Az,psi);
|
||||
// zAz = innerProduct(Az,psi);
|
||||
zAAz= norm2(Az);
|
||||
MatTimer.Stop();
|
||||
|
||||
@ -170,7 +171,7 @@ public:
|
||||
|
||||
LinalgTimer.Start();
|
||||
|
||||
zAz = innerProduct(Az,psi);
|
||||
// zAz = innerProduct(Az,psi);
|
||||
zAAz= norm2(Az);
|
||||
|
||||
//p[0],q[0],qq[0]
|
||||
@ -212,7 +213,7 @@ public:
|
||||
MatTimer.Start();
|
||||
Linop.Op(z,Az);
|
||||
MatTimer.Stop();
|
||||
zAz = innerProduct(Az,psi);
|
||||
// zAz = innerProduct(Az,psi);
|
||||
zAAz= norm2(Az);
|
||||
|
||||
LinalgTimer.Start();
|
||||
|
@ -170,6 +170,7 @@ private:
|
||||
|
||||
public:
|
||||
static void Print(void);
|
||||
static void PrintState( void* CpuPtr);
|
||||
static int isOpen (void* CpuPtr);
|
||||
static void ViewClose(void* CpuPtr,ViewMode mode);
|
||||
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||
|
@ -474,6 +474,32 @@ int MemoryManager::isOpen (void* _CpuPtr)
|
||||
}
|
||||
}
|
||||
|
||||
void MemoryManager::PrintState(void* _CpuPtr)
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
|
||||
if ( EntryPresent(CpuPtr) ){
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
std::string str;
|
||||
if ( AccCache.state==Empty ) str = std::string("Empty");
|
||||
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
|
||||
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
|
||||
if ( AccCache.state==Consistent)str = std::string("Consistent");
|
||||
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
|
||||
|
||||
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
||||
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||
<< "\t" << AccCache.cpuLock
|
||||
<< "\t" << AccCache.accLock
|
||||
<< "\t" << AccCache.LRU_valid<<std::endl;
|
||||
|
||||
} else {
|
||||
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
||||
|
@ -16,6 +16,10 @@ uint64_t MemoryManager::DeviceToHostXfer;
|
||||
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
||||
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
|
||||
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
|
||||
void MemoryManager::PrintState(void* CpuPtr)
|
||||
{
|
||||
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
|
||||
};
|
||||
void MemoryManager::Print(void){};
|
||||
void MemoryManager::NotifyDeletion(void *ptr){};
|
||||
|
||||
|
@ -88,6 +88,13 @@ public:
|
||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
||||
accessor.ViewClose();
|
||||
}
|
||||
|
||||
// Helper function to print the state of this object in the AccCache
|
||||
void PrintCacheState(void)
|
||||
{
|
||||
MemoryManager::PrintState(this->_odata);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
// Return a view object that may be dereferenced in site loops.
|
||||
// The view is trivially copy constructible and may be copied to an accelerator device
|
||||
|
@ -576,7 +576,8 @@ class ScidacReader : public GridLimeReader {
|
||||
std::string rec_name(ILDG_BINARY_DATA);
|
||||
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
|
||||
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
|
||||
skipPastObjectRecord(std::string(GRID_FIELD_NORM));
|
||||
// in principle should do the line below, but that breaks backard compatibility with old data
|
||||
// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
|
||||
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
|
||||
return;
|
||||
}
|
||||
|
@ -828,6 +828,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
|
||||
#if (!defined(GRID_HIP))
|
||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
||||
////////////////////////////////////////////////
|
||||
// GENERAL CAYLEY CASE
|
||||
////////////////////////////////////////////////
|
||||
@ -880,7 +881,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
}
|
||||
|
||||
std::vector<RealD> G_s(Ls,1.0);
|
||||
RealD sign = 1; // sign flip for vector/tadpole
|
||||
RealD sign = 1.0; // sign flip for vector/tadpole
|
||||
if ( curr_type == Current::Axial ) {
|
||||
for(int s=0;s<Ls/2;s++){
|
||||
G_s[s] = -1.0;
|
||||
@ -890,7 +891,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
auto b=this->_b;
|
||||
auto c=this->_c;
|
||||
if ( b == 1 && c == 0 ) {
|
||||
sign = -1;
|
||||
sign = -1.0;
|
||||
}
|
||||
else {
|
||||
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
|
||||
@ -934,7 +935,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
tmp = Cshift(tmp,mu,-1);
|
||||
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
|
||||
tmp = -G_s[s]*( Utmp + gmu*Utmp );
|
||||
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
|
||||
// Mask the time
|
||||
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
|
||||
unsigned int t0 = 0;
|
||||
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
|
||||
} else {
|
||||
tmp = where((lcoor>=tmin+tshift),tmp,zz);
|
||||
}
|
||||
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
|
||||
|
||||
InsertSlice(L_Q, q_out, s , 0);
|
||||
|
@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
|
||||
class TypePair {
|
||||
public:
|
||||
T _internal[2];
|
||||
TypePair<T>& operator=(const Grid::Zero& o) {
|
||||
accelerator TypePair<T>& operator=(const Grid::Zero& o) {
|
||||
_internal[0] = Zero();
|
||||
_internal[1] = Zero();
|
||||
return *this;
|
||||
}
|
||||
|
||||
TypePair<T> operator+(const TypePair<T>& o) const {
|
||||
accelerator TypePair<T> operator+(const TypePair<T>& o) const {
|
||||
TypePair<T> r;
|
||||
r._internal[0] = _internal[0] + o._internal[0];
|
||||
r._internal[1] = _internal[1] + o._internal[1];
|
||||
return r;
|
||||
}
|
||||
|
||||
TypePair<T>& operator+=(const TypePair<T>& o) {
|
||||
accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
|
||||
_internal[0] += o._internal[0];
|
||||
_internal[1] += o._internal[1];
|
||||
return *this;
|
||||
|
@ -84,7 +84,8 @@ void acceleratorInit(void)
|
||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorCudaInit: using default device \n");
|
||||
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
|
||||
printf("AcceleratorCudaInit: assume user either uses\n");
|
||||
printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
|
||||
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
|
||||
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
@ -109,6 +110,7 @@ void acceleratorInit(void)
|
||||
|
||||
#ifdef GRID_HIP
|
||||
hipDeviceProp_t *gpu_props;
|
||||
hipStream_t copyStream;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int nDevices = 1;
|
||||
@ -166,16 +168,25 @@ void acceleratorInit(void)
|
||||
#ifdef GRID_DEFAULT_GPU
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorHipInit: using default device \n");
|
||||
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
|
||||
printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
|
||||
}
|
||||
int device = 0;
|
||||
#else
|
||||
if ( world_rank == 0 ) {
|
||||
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
|
||||
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
|
||||
printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
|
||||
}
|
||||
hipSetDevice(rank);
|
||||
int device = rank;
|
||||
#endif
|
||||
hipSetDevice(device);
|
||||
hipStreamCreate(©Stream);
|
||||
const int len=64;
|
||||
char busid[len];
|
||||
if( rank == world_rank ) {
|
||||
hipDeviceGetPCIBusId(busid, len, device);
|
||||
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
|
||||
}
|
||||
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
|
||||
}
|
||||
#endif
|
||||
|
@ -230,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
|
||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
{
|
||||
// int uvm=0;
|
||||
@ -337,6 +338,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
#define accelerator __host__ __device__
|
||||
#define accelerator_inline __host__ __device__ inline
|
||||
|
||||
extern hipStream_t copyStream;
|
||||
/*These routines define mapping from thread grid to loop & vector lane indexing */
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
@ -411,10 +413,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
inline void acceleratorCopySynchronise(void) { }
|
||||
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
//inline void acceleratorCopySynchronise(void) { }
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@ -485,18 +493,12 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
|
||||
///////////////////////////////////////////////////
|
||||
// Synchronise across local threads for divergence resynch
|
||||
///////////////////////////////////////////////////
|
||||
accelerator_inline void acceleratorSynchronise(void)
|
||||
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs
|
||||
{
|
||||
#ifdef GRID_SIMT
|
||||
#ifdef GRID_CUDA
|
||||
__syncwarp();
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
//cl::sycl::detail::workGroupBarrier();
|
||||
#endif
|
||||
#ifdef GRID_HIP
|
||||
__syncthreads();
|
||||
#endif
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
Reference in New Issue
Block a user