1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Compare commits

...

10 Commits

Author SHA1 Message Date
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neigbour table 2024-10-23 14:47:55 -04:00
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
24 changed files with 254 additions and 211 deletions

View File

@ -215,7 +215,7 @@ public:
else if ( sign == forward ) div = 1.0; else if ( sign == forward ) div = 1.0;
else assert(0); else assert(0);
std::cout << "Making FFTW plan" << std::endl; std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
std::cout << "Making pencil" << std::endl; std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd); Coordinate lcoor(Nd), gcoor(Nd);
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
} }
} }
std::cout << "Looping orthog" << std::endl; std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords // Loop over orthog coords
int NN=pencil_g.lSites(); int NN=pencil_g.lSites();
GridStopWatch timer; GridStopWatch timer;
@ -274,7 +274,7 @@ public:
usec += timer.useconds(); usec += timer.useconds();
flops+= flops_call*NN; flops+= flops_call*NN;
std::cout << "Writing back results " << std::endl; std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result // writing out result
{ {
autoView(pgbuf_v,pgbuf,CpuRead); autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
} }
result = result*div; result = result*div;
std::cout << "Destroying plan " << std::endl; std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan // destroying plan
FFTW<scalar>::fftw_destroy_plan(p); FFTW<scalar>::fftw_destroy_plan(p);
#endif #endif

View File

@ -6,11 +6,10 @@ NAMESPACE_BEGIN(Grid);
#define MAXLINE 512 #define MAXLINE 512
static char print_buffer [ MAXLINE ]; static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...) //#define dprintf(...)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@ -168,7 +167,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes; DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++; DeviceToHostXfer++;
AccCache.state=Consistent; AccCache.state=Consistent;
@ -183,7 +182,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
} }
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes; HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++; HostToDeviceXfer++;
@ -264,7 +265,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
(uint64_t)AccCache.CpuPtr, (uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr, (uint64_t)CpuPtr,
(uint64_t)AccCache.bytes, (uint64_t)AccCache.bytes,

View File

@ -55,6 +55,11 @@ public:
RealD alpha; // Mobius scale RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant RealD k; // EOFA normalization constant
// Device resident
deviceVector<Coeff_t> d_shift_coefficients;
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
virtual void Instantiatable(void) = 0; virtual void Instantiatable(void) = 0;
// EOFA-specific operations // EOFA-specific operations
@ -92,6 +97,11 @@ public:
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
d_shift_coefficients.resize(Ls);
d_MooeeInv_shift_lc.resize(Ls);
d_MooeeInv_shift_norm.resize(Ls);
}; };
}; };

View File

@ -143,6 +143,17 @@ public:
std::vector<Coeff_t> ueem; std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee; std::vector<Coeff_t> dee;
// Device memory
deviceVector<Coeff_t> d_diag;
deviceVector<Coeff_t> d_upper;
deviceVector<Coeff_t> d_lower;
deviceVector<Coeff_t> d_lee;
deviceVector<Coeff_t> d_dee;
deviceVector<Coeff_t> d_uee;
deviceVector<Coeff_t> d_leem;
deviceVector<Coeff_t> d_ueem;
// Matrices of 5d ee inverse params // Matrices of 5d ee inverse params
// std::vector<iSinglet<Simd> > MatpInv; // std::vector<iSinglet<Simd> > MatpInv;
// std::vector<iSinglet<Simd> > MatmInv; // std::vector<iSinglet<Simd> > MatmInv;

View File

@ -42,7 +42,7 @@ public:
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist); this->MomentumSpacePropagatorHw(out,in,_m,twist);
}; };
// Constructors // Constructors
OverlapWilsonCayleyTanhFermion(GaugeField &_Umu, OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,

View File

@ -41,6 +41,10 @@ public:
public: public:
// Constructors // Constructors
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -41,6 +41,9 @@ public:
public: public:
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonContFracTanhFermion(GaugeField &_Umu, OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -40,6 +40,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -41,6 +41,9 @@ public:
public: public:
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -40,6 +40,11 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu, OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
public: public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
const int part_frac_chroma_convention=1; const int part_frac_chroma_convention=0;
void Meooe_internal(const FermionField &in, FermionField &out,int dag); void Meooe_internal(const FermionField &in, FermionField &out,int dag);
void Mooee_internal(const FermionField &in, FermionField &out,int dag); void Mooee_internal(const FermionField &in, FermionField &out,int dag);

View File

@ -414,29 +414,6 @@ public:
// surface_list.resize(0); // surface_list.resize(0);
this->same_node.resize(npoints); this->same_node.resize(npoints);
}; };
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
*/
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)

View File

@ -488,7 +488,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
leem.resize(Ls); leem.resize(Ls);
uee.resize(Ls); uee.resize(Ls);
ueem.resize(Ls); ueem.resize(Ls);
for(int i=0;i<Ls;i++){ for(int i=0;i<Ls;i++){
dee[i] = bee[i]; dee[i] = bee[i];
@ -529,6 +529,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
dee[Ls-1] += delta_d; dee[Ls-1] += delta_d;
} }
//////////////////////////////////////////
// Device buffers
//////////////////////////////////////////
d_diag.resize(Ls);
d_upper.resize(Ls);
d_lower.resize(Ls);
d_dee.resize(Ls);
d_lee.resize(Ls);
d_uee.resize(Ls);
d_leem.resize(Ls);
d_ueem.resize(Ls);
// int inv=1; // int inv=1;
// this->MooeeInternalCompute(0,inv,MatpInv,MatmInv); // this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); // this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);

View File

@ -57,9 +57,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
int Ls =this->Ls; int Ls =this->Ls;
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &d_upper[0];
@ -99,9 +99,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
int Ls=this->Ls; int Ls=this->Ls;
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &d_upper[0];
@ -134,11 +134,11 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
int Ls=this->Ls; int Ls=this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0]; auto plee = & d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & d_dee [0];
@ -196,11 +196,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
autoView(psi , psi_i,AcceleratorRead); autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite); autoView(chi , chi_i,AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0]; auto plee = & d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & d_dee [0];

View File

@ -51,13 +51,13 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
autoView( chi , chi_i, AcceleratorWrite); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
auto pdiag = &d_diag[0]; acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
auto pupper = &d_upper[0]; acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
auto plower = &d_lower[0]; acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -89,14 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
autoView( phi , phi_i, AcceleratorRead); autoView( phi , phi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &this->d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &this->d_upper[0];
auto plower = &d_lower[0]; auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -125,18 +125,18 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
autoView( chi, chi_i, AcceleratorWrite); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->d_lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->d_dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->d_uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = & this->d_leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = & this->d_ueem[0];
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
uint64_t nloop=grid->oSites()/Ls; uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;

View File

@ -50,14 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -93,15 +93,15 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); auto pshift_coeffs = &this->d_shift_coefficients[0];
auto pdiag = &d_diag[0]; acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
auto pupper = &d_upper[0]; acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
auto plower = &d_lower[0]; acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
auto pshift_coeffs = &d_shift_coeffs[0]; acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -138,14 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &this->d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &this->d_upper[0];
auto plower = &d_lower[0]; auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -180,16 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); auto pshift_coeffs = &this->d_shift_coefficients[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
auto pshift_coeffs = &d_shift_coeffs[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
auto pm = this->pm; auto pm = this->pm;
@ -230,17 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->d_lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->d_dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->d_uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = & this->d_leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = & this->d_ueem[0];
auto plee = & d_lee [0]; acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
auto pdee = & d_dee [0]; acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto puee = & d_uee [0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
auto pleem = & d_leem[0]; acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
auto pueem = & d_ueem[0]; acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
@ -293,23 +293,22 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
// Move into object and constructor // Move into object and constructor
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto pm = this->pm; auto pm = this->pm;
auto plee = & d_lee [0]; auto plee = & this->d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & this->d_dee [0];
auto puee = & d_uee [0]; auto puee = & this->d_uee [0];
auto pleem = & d_leem[0]; auto pleem = & this->d_leem[0];
auto pueem = & d_ueem[0]; auto pueem = & this->d_ueem[0];
auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0]; acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -367,17 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = &this->d_lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = &this->d_dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = &this->d_uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = &this->d_leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = &this->d_ueem[0];
auto plee = & d_lee [0]; acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
auto pdee = & d_dee [0]; acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto puee = & d_uee [0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
auto pleem = & d_leem[0]; acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
auto pueem = & d_ueem[0]; acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -426,25 +425,23 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto pm = this->pm; auto pm = this->pm;
auto plee = & d_lee [0]; auto plee = & this->d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & this->d_dee [0];
auto puee = & d_uee [0]; auto puee = & this->d_uee [0];
auto pleem = & d_leem[0]; auto pleem = & this->d_leem[0];
auto pueem = & d_ueem[0]; auto pueem = & this->d_ueem[0];
static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls); auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls); auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0]; acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

View File

@ -411,17 +411,18 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
int Ls = this->Ls; int Ls = this->Ls;
conformable(solution5d.Grid(),this->FermionGrid()); conformable(solution5d.Grid(),this->FermionGrid());
conformable(exported4d.Grid(),this->GaugeGrid()); conformable(exported4d.Grid(),this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); ExtractSlice(exported4d, solution5d, Ls-1, 0);
} }
template<class Impl> template<class Impl>
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{ {
//void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
int Ls = this->Ls; int Ls = this->Ls;
conformable(imported5d.Grid(),this->FermionGrid()); conformable(imported5d.Grid(),this->FermionGrid());
conformable(input4d.Grid() ,this->GaugeGrid()); conformable(input4d.Grid() ,this->GaugeGrid());
FermionField tmp(this->FermionGrid()); FermionField tmp(this->FermionGrid());
tmp=Zero(); tmp=Zero();
InsertSlice(input4d, tmp, Ls-1, Ls-1); InsertSlice(input4d, tmp, Ls-1, 0);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d); this->Dminus(tmp,imported5d);
} }

View File

@ -40,6 +40,11 @@ public:
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
using Action<GaugeField>::S;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
using Action<GaugeField>::refresh;
private: private:
RealD c_plaq; RealD c_plaq;
RealD c_rect; RealD c_rect;

View File

@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Map a su2 subgroup number to the pair of rows that are non zero // Map a su2 subgroup number to the pair of rows that are non zero
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
int spare = su2_index; int spare = su2_index;

View File

@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
// Map a su2 subgroup number to the pair of rows that are non zero // Map a su2 subgroup number to the pair of rows that are non zero
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
template <ONLY_IF_Sp> template <ONLY_IF_Sp>
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
const int nsp=ncolour/2; const int nsp=ncolour/2;
assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2)); assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));

View File

@ -121,17 +121,22 @@ class CartesianStencilAccelerator {
StencilVector same_node; StencilVector same_node;
Coordinate _simd_layout; Coordinate _simd_layout;
Parameters parameters; Parameters parameters;
ViewMode mode;
StencilEntry* _entries_p; StencilEntry* _entries_p;
StencilEntry* _entries_host_p;
cobj* u_recv_buf_p; cobj* u_recv_buf_p;
cobj* u_send_buf_p; cobj* u_send_buf_p;
accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }
accelerator_inline int GetNodeLocal(int osite,int point) const { // Not a device function
return this->_entries_p[point+this->_npoints*osite]._is_local; inline int GetNodeLocal(int osite,int point) const {
StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
return SE._is_local;
} }
accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; ptype = this->_permute_type[point];
return & this->_entries_p[point+this->_npoints*osite];
} }
accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
@ -164,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
{ {
public: public:
int *closed; int *closed;
StencilEntry *cpu_ptr; // StencilEntry *cpu_ptr;
ViewMode mode;
public: public:
// default copy constructor // default copy constructor
CartesianStencilView (const CartesianStencilView &refer_to_me) = default; CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode) CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
cpu_ptr(this->_entries_p),
mode(_mode)
{ {
this->_entries_p =(StencilEntry *) this->ViewOpen(_mode);
MemoryManager::ViewOpen(this->_entries_p, }
this->_npoints*this->_osites*sizeof(StencilEntry), void ViewOpen(ViewMode _mode)
mode, {
AdviseDefault); this->mode = _mode;
} }
void ViewClose(void) void ViewClose(void) { }
{
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
}; };
@ -274,8 +273,8 @@ public:
std::vector<deviceVector<std::pair<int,int> > > face_table ; std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list; deviceVector<int> surface_list;
std::vector<StencilEntry> _entries; // Resident in host memory std::vector<StencilEntry> _entries; // Resident in host memory
deviceVector<StencilEntry> _entries_device; // Resident in device memory deviceVector<StencilEntry> _entries_device; // Resident in device memory
std::vector<Packet> Packets; std::vector<Packet> Packets;
std::vector<Merge> Mergers; std::vector<Merge> Mergers;
std::vector<Merge> MergersSHM; std::vector<Merge> MergersSHM;
@ -626,10 +625,10 @@ public:
//////////////////////////////////////// ////////////////////////////////////////
void PrecomputeByteOffsets(void){ void PrecomputeByteOffsets(void){
for(int i=0;i<_entries.size();i++){ for(int i=0;i<_entries.size();i++){
if( _entries[i]._is_local ) { if( this->_entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
} else { } else {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
} }
} }
}; };
@ -764,7 +763,13 @@ public:
this->_osites = _grid->oSites(); this->_osites = _grid->oSites();
_entries.resize(this->_npoints* this->_osites); _entries.resize(this->_npoints* this->_osites);
this->_entries_p = &_entries[0]; _entries_device.resize(this->_npoints* this->_osites);
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){ for(int ii=0;ii<npoints;ii++){
int i = ii; // reverse direction to get SIMD comms done first int i = ii; // reverse direction to get SIMD comms done first
@ -841,6 +846,7 @@ public:
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
} }
PrecomputeByteOffsets(); PrecomputeByteOffsets();
acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
} }
void Local (int point, int dimension,int shiftpm,int cbmask) void Local (int point, int dimension,int shiftpm,int cbmask)
@ -996,10 +1002,10 @@ public:
for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){ for(int b=0;b<_grid->_slice_block[dimension];b++){
int idx=point+(lo+o+b)*this->_npoints; int idx=point+(lo+o+b)*this->_npoints;
_entries[idx]._offset =ro+o+b; this->_entries[idx]._offset =ro+o+b;
_entries[idx]._permute=permute; this->_entries[idx]._permute=permute;
_entries[idx]._is_local=1; this->_entries[idx]._is_local=1;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
} }
@ -1017,10 +1023,10 @@ public:
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
int idx = point+(lo+o+b)*this->_npoints; int idx = point+(lo+o+b)*this->_npoints;
_entries[idx]._offset =ro+o+b; this->_entries[idx]._offset =ro+o+b;
_entries[idx]._is_local=1; this->_entries[idx]._is_local=1;
_entries[idx]._permute=permute; this->_entries[idx]._permute=permute;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
} }
@ -1044,10 +1050,10 @@ public:
for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){ for(int b=0;b<_grid->_slice_block[dimension];b++){
int idx=point+(so+o+b)*this->_npoints; int idx=point+(so+o+b)*this->_npoints;
_entries[idx]._offset =offset+(bo++); this->_entries[idx]._offset =offset+(bo++);
_entries[idx]._is_local=0; this->_entries[idx]._is_local=0;
_entries[idx]._permute=0; this->_entries[idx]._permute=0;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
} }
@ -1064,10 +1070,10 @@ public:
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
int idx = point+(so+o+b)*this->_npoints; int idx = point+(so+o+b)*this->_npoints;
_entries[idx]._offset =offset+(bo++); this->_entries[idx]._offset =offset+(bo++);
_entries[idx]._is_local=0; this->_entries[idx]._is_local=0;
_entries[idx]._permute =0; this->_entries[idx]._permute =0;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];

View File

@ -39,7 +39,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1}); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;
@ -279,6 +279,7 @@ int main (int argc, char ** argv)
result5 = result5 - Kinetic; result5 = result5 - Kinetic;
std::cout<<"diff "<< norm2(result5)<<std::endl; std::cout<<"diff "<< norm2(result5)<<std::endl;
assert(norm2(result5)<1.0e-4);
} }
@ -357,6 +358,7 @@ int main (int argc, char ** argv)
diff = ref - result4; diff = ref - result4;
std::cout << "result - ref "<<norm2(diff)<<std::endl; std::cout << "result - ref "<<norm2(diff)<<std::endl;
assert(norm2(diff)<1.0e-4);
} }
@ -440,6 +442,7 @@ int main (int argc, char ** argv)
diff = ref - result4; diff = ref - result4;
std::cout << "result - ref "<<norm2(diff)<<std::endl; std::cout << "result - ref "<<norm2(diff)<<std::endl;
assert(norm2(diff)<1.0e-4);
} }

View File

@ -38,7 +38,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1}); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;
@ -74,7 +74,7 @@ int main (int argc, char ** argv)
{ {
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n"; std::cout << "Testing OverlapWilsonPartialFractionZolotarevFermionD Hw kernel Mom space 4d propagator \n";
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
// LatticeFermionD src(&GRID); gaussian(pRNG,src); // LatticeFermionD src(&GRID); gaussian(pRNG,src);
@ -122,6 +122,7 @@ int main (int argc, char ** argv)
MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov); MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov);
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
CG(HermOp,src5,result5); CG(HermOp,src5,result5);
std::cout << " Solved by Conjugate Gradient (CGNE)" <<std::endl;
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Domain wall physical field propagator // Domain wall physical field propagator
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
@ -153,7 +154,7 @@ int main (int argc, char ** argv)
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
{ {
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
std::cout << "Testing Dov(Hw) Mom space 4d propagator \n"; std::cout << "Testing OverlapWilsonCayleyTanhFermionD space 4d propagator \n";
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
LatticeFermionD tmp(&GRID); LatticeFermionD tmp(&GRID);
@ -228,7 +229,7 @@ int main (int argc, char ** argv)
{ {
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n"; std::cout<<"Testing OverlapWilsonPartialFractionZolotarevFermionD Hw kernel Mom space 4d propagator with q\n";
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
// LatticeFermionD src(&GRID); gaussian(pRNG,src); // LatticeFermionD src(&GRID); gaussian(pRNG,src);

View File

@ -39,7 +39,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout( { vComplexF::Nsimd(),1,1,1}); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
// Coordinate simd_layout( { vComplexF::Nsimd(),1,1,1});
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;